knitr::opts_chunk$set(echo = TRUE)
knitr::opts_chunk$set(fig.width=6, fig.height=4)
options(scipen = 999) ## disable scientific notation 4.835312e-04 --> 0.0004835312
# install.packages("alr3") ## note - you only have to do this once
library(alr3)
library(ggplot2)
library(dplyr)
library(tidyr)
library(broom)
library(splines)
library(gridExtra)
library(reshape)
library(glmnet)
library(MASS)
library(mgcv)
library(lme4)
library(gee)
library(readr)
library(boot)
library(SemiPar) # get data(libar)
## Intro to regression
####################################################################
## Setup: load the mother/daughter heights data (alr3::heights)
## and take an initial look at its structure and distribution.
####################################################################
## clear work space
## NOTE(review): rm(list = ls()) clears only the global environment -- it
## does not detach packages or reset options(); restarting R is safer for
## true reproducibility.
rm(list = ls())
## load heights data
data(heights)
####################################################################
## initial data looking
####################################################################
## some summary looks at the height data set
head(heights)
## Mheight Dheight
## 1 59.7 55.1
## 2 58.2 56.5
## 3 60.6 56.0
## 4 60.7 56.8
## 5 61.8 56.0
## 6 55.5 57.9
## convert to a tibble for nicer printing.
## FIX: tbl_df() is deprecated (dplyr >= 1.0.0); as_tibble() is the
## supported replacement and behaves identically here.
heights = as_tibble(heights)
heights
## # A tibble: 1,375 x 2
## Mheight Dheight
## <dbl> <dbl>
## 1 59.7 55.1
## 2 58.2 56.5
## 3 60.6 56
## 4 60.7 56.8
## 5 61.8 56
## 6 55.5 57.9
## 7 55.4 57.1
## 8 56.8 57.6
## 9 57.5 57.2
## 10 57.3 57.1
## # ... with 1,365 more rows
dplyr::glimpse(heights)
## Observations: 1,375
## Variables: 2
## $ Mheight <dbl> 59.7, 58.2, 60.6, 60.7, 61.8, 55.5, 55.4, 56.8, 57.5, ...
## $ Dheight <dbl> 55.1, 56.5, 56.0, 56.8, 56.0, 57.9, 57.1, 57.6, 57.2, ...
str(heights)
## Classes 'tbl_df', 'tbl' and 'data.frame': 1375 obs. of 2 variables:
## $ Mheight: num 59.7 58.2 60.6 60.7 61.8 55.5 55.4 56.8 57.5 57.3 ...
## $ Dheight: num 55.1 56.5 56 56.8 56 57.9 57.1 57.6 57.2 57.1 ...
dim(heights)
## [1] 1375 2
## look at daughters' heights only
summary(heights[,2])
## Dheight
## Min. :55.10
## 1st Qu.:62.00
## Median :63.60
## Mean :63.75
## 3rd Qu.:65.60
## Max. :73.10
## histogram of the outcome variable (daughter height)
ggplot(heights, aes(x = Dheight)) +
geom_histogram(fill = "#0000FF", alpha = .5, color = "black") +
labs(x = "Daughter Height", y = "Count", title = "Histogram of Daughter Heights")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## plot daughter height against mother height
ggplot(heights, aes(x = Mheight, y = Dheight)) +
geom_point(color = "red") +
labs(x = "Mother Height", y = "Daughter Height")

####################################################################
## some "regression" analyses
####################################################################
## Each candidate regression function f(x) below is overlaid on the same
## base scatterplot plot0. The data are sorted by Mheight first so that
## geom_path() draws the curves left-to-right rather than zig-zagging.
plot0 = ggplot(heights, aes(x = Mheight, y = Dheight)) +
geom_point(color = "red") +
labs(x = "Mother Height", y = "Daughter Height")
## order predictors and outcome
heights = arrange(heights, Mheight)
x = heights$Mheight
y = heights$Dheight
## f(x) = E(y)
## candidate 1: constant function -- ignores x entirely (underfit)
fx = mean(y)
plot1 = plot0 + geom_hline(aes(yintercept = fx), color = "blue", size =1.2)
plot1

## interpolation
## candidate 2: interpolate every point -- fits the data perfectly but
## generalizes terribly (extreme overfit)
fx = y
plot2 = plot0 + geom_path(aes(x = x, y = fx), color = "blue")
plot2

## bin means
## candidate 3: step function; cut() splits x into 5 equal-width bins and
## each observation gets the mean y of its bin
x.bin = as.numeric(cut(x, breaks = 5))
fx = sapply(x.bin, function(u) mean(y[which(x.bin == u)]))
plot3 = plot0 + geom_path(aes(x = x, y = fx), color = "blue", size = 1.2)
plot3

## smooth curve 1
## candidate 4: loess with a small span (.15) -- wiggly, low-bias fit
fit.loess = loess(y ~ x, span = .15)
fx = predict(fit.loess, data.frame(x = x))
plot4 = plot0 + geom_path(aes(x = x, y = fx), color = "blue", size = 1.2)
plot4

## smooth curve 2
## candidate 5: loess with a large span (.75) -- smoother, higher-bias fit
fit.loess = loess(y ~ x, span = .75)
fx = predict(fit.loess, data.frame(x = x))
plot5 = plot0 + geom_path(aes(x = x, y = fx), color = "blue", size = 1.2)
plot5

## line 1
## candidate 6: an arbitrary, hand-picked straight line
fx = 40 + .4 * x
plot6 = plot0 + geom_path(aes(x = x, y = fx), color = "blue", size = 1.2)
plot6

## line 2
## candidate 7: the least squares line from lm()
fit.lm = lm(y~x)
fx = predict(fit.lm, data.frame(x = x))
plot7 = plot0 + geom_path(aes(x = x, y = fx), color = "blue", size = 1.2)
plot7

## plot0 + stat_smooth(method = "lm")
####################################################################
## simple linear regression
####################################################################
## Fit Dheight ~ Mheight and inspect the fit three ways: base summary(),
## broom::glance() (one-row model summary), broom::tidy() (coefficients
## as a tibble).
linmod = lm(Dheight ~ Mheight, data = heights)
summary(linmod)
##
## Call:
## lm(formula = Dheight ~ Mheight, data = heights)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.397 -1.529 0.036 1.492 9.053
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 29.91744 1.62247 18.44 <0.0000000000000002 ***
## Mheight 0.54175 0.02596 20.87 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.266 on 1373 degrees of freedom
## Multiple R-squared: 0.2408, Adjusted R-squared: 0.2402
## F-statistic: 435.5 on 1 and 1373 DF, p-value: < 0.00000000000000022
broom::glance(linmod)
## # A tibble: 1 x 11
## r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
## * <dbl> <dbl> <dbl> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.241 0.240 2.27 435. 3.22e-84 2 -3075. 6156. 6172.
## # ... with 2 more variables: deviance <dbl>, df.residual <int>
tidy(linmod)
## # A tibble: 2 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 29.9 1.62 18.4 5.21e-68
## 2 Mheight 0.542 0.0260 20.9 3.22e-84
####################################################################
## Simple linear regression and least squares properties
####################################################################
## plot for interpretation
####################################################################
## Simulate data from a known linear model (intercept 2, slope .6) and
## plot the fitted least squares line with the axes for interpreting the
## intercept (value of the line at x = 0).
set.seed(1)
data = data.frame(X = rnorm(30, 3, 3)) %>% mutate(Y = 2+.6*X +rnorm(30, 0, 1))
ggplot(data, aes(x = X, y = Y)) +
geom_smooth(method = 'lm', se = FALSE, size = 2) +
geom_point(alpha = .1) +
theme_bw() +
geom_hline(yintercept = 0) +
geom_vline(xintercept = 0)

####################################################################
## a few possible scatterplots
####################################################################
## plot 1
## no relationship: x and y drawn independently
plot.data = data.frame(x = rnorm(100),
y = rnorm(100))
ggplot(plot.data, aes(x = x, y = y)) + geom_point()

## plot 2
## strong negative linear relationship
plot.data = data.frame(x = rnorm(100)) %>% mutate(y = -2-2*x + rnorm(100, 0, .5))
ggplot(plot.data, aes(x = x, y = y)) + geom_point()

## plot 3
## nonlinear (quadratic) relationship
plot.data = data.frame(x = rnorm(100)) %>% mutate(y = -2+2*(x+1)^2 + rnorm(100, 0, 2))
ggplot(plot.data, aes(x = x, y = y)) + geom_point()

## plot 4
## same data with two manually planted outliers: one at extreme x (high
## leverage) and one in the middle of the x range
plot.data = mutate(plot.data, x = replace(x, 1, 4), y = replace(y, 1, 22), y = replace(y, order(x)[50], 15))
ggplot(plot.data, aes(x = x, y = y)) + geom_point()

####################################################################
## plot for interpretation -- this code uses "old fashioned" plotting
####################################################################
## Same simulated data as above; black points are the fitted values,
## red points the observations, so vertical gaps are the residuals.
set.seed(1)
data = data.frame(X = rnorm(30, 3, 3)) %>% mutate(Y = 2+.6*X +rnorm(30, 0, 1))
data$fitted = lm(Y~X, data = data) %>% fitted
ggplot(data, aes(x = X, y = Y)) +
geom_smooth(method = 'lm', se = FALSE, size = 1) +
geom_point(color = "red", size = 2) +
geom_point(aes(y = fitted), size = 2) +
theme_bw() +
geom_hline(yintercept = 0) +
geom_vline(xintercept = 0)

####################################################################
## recreate linear model output using formulas
####################################################################
## Simulate (x, y) from y = 2 + .6 x + noise; fit with lm() here, then
## the following section reproduces the estimates from the closed-form
## least squares formulas.
set.seed(1)
data = data.frame(x = rnorm(30, 3, 3)) %>% mutate(y = 2+.6*x +rnorm(30, 0, 1))
## make a plot
## dashed line = null model (mean of y); solid line = least squares fit
ggplot(data, aes(x = x, y = y)) +
geom_point(color = "red", size = 3) +
geom_hline(aes(yintercept = mean(y)),
color = "blue", linetype = 2, size = 1) +
geom_smooth(method = "lm", se = FALSE,
color = "blue", size = 1)

linmod = lm(y~x, data = data)
summary(linmod)
##
## Call:
## lm(formula = y ~ x, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.5202 -0.5050 -0.2297 0.5753 1.8534
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.08743 0.22958 9.092 0.00000000075297 ***
## x 0.61396 0.05415 11.338 0.00000000000561 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8084 on 28 degrees of freedom
## Multiple R-squared: 0.8211, Adjusted R-squared: 0.8148
## F-statistic: 128.6 on 1 and 28 DF, p-value: 0.000000000005612
tidy(linmod)
## # A tibble: 2 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 2.09 0.230 9.09 7.53e-10
## 2 x 0.614 0.0542 11.3 5.61e-12
glance(linmod)
## # A tibble: 1 x 11
## r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
## * <dbl> <dbl> <dbl> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.821 0.815 0.808 129. 5.61e-12 2 -35.2 76.3 80.5
## # ... with 2 more variables: deviance <dbl>, df.residual <int>
## Compute the least squares estimates "by hand" and compare with lm().
## BUG FIX: the original computed beta1/beta0 from the global `x` and `y`
## vectors left over from the heights analysis above (yielding 0.541747
## and 29.91744 -- the heights fit, as the captured output showed), not
## from the simulated data that `linmod` was fit to. Pull the columns out
## of `data` so the hand computation matches summary(linmod).
x = data$x
y = data$y
(beta1 = sum((x - mean(x))*(y - mean(y))) / sum((x - mean(x))^2))
## [1] 0.6139621
(beta0 = mean(y) - beta1*mean(x))
## [1] 2.087434
## names() tells you what's in an object
names(linmod)
## [1] "coefficients" "residuals" "effects" "rank"
## [5] "fitted.values" "assign" "qr" "df.residual"
## [9] "xlevels" "call" "terms" "model"
linmod$residuals
## 1 2 3 4 5 6
## 1.2555987 -0.2398006 0.2933523 -0.2499462 -1.5201821 -0.5099489
## 7 8 9 10 11 12
## -0.5440273 -0.2195598 0.9465873 0.6466466 -0.3571673 -0.3990115
## 13 14 15 16 17 18
## 0.5936641 0.5201081 -0.8651956 -0.8349338 0.2359394 0.5996784
## 19 20 21 22 23 24
## -0.2760649 0.7269107 0.2302926 -0.7741079 0.2086757 -1.1753572
## 25 26 27 28 29 30
## 1.2777408 1.8534302 -0.4900165 -1.1118509 0.4604269 -0.2818813
linmod$fitted.values
## 1 2 3 4 5 6
## 2.7754640 4.2675708 2.3901878 6.8676466 4.5362366 2.4181112
## 7 8 9 10 11 12
## 4.8271096 5.2892309 4.9898445 3.3668300 6.7138498 4.6473676
## 13 14 15 16 17 18
## 2.7850662 -0.1499047 6.0013156 3.8465581 3.8995001 5.6677597
## 19 20 21 22 23 24
## 5.4419168 5.0232194 5.6219726 5.3699269 4.0666609 0.2651611
## 25 26 27 28 29 30
## 5.0709693 3.8259380 3.6423631 1.2203620 3.0486227 4.6991216
## names() on the summary of linmod
names(summary(linmod))
## [1] "call" "terms" "residuals" "coefficients"
## [5] "aliased" "sigma" "df" "r.squared"
## [9] "adj.r.squared" "fstatistic" "cov.unscaled"
summary(linmod)$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.0874344 0.22958105 9.092364 0.000000000752971072
## x 0.6139621 0.05415004 11.338166 0.000000000005611585
summary(linmod)$r.squared
## [1] 0.821148
## ANOVA table for linmod
anova(linmod)
## Analysis of Variance Table
##
## Response: y
## Df Sum Sq Mean Sq F value Pr(>F)
## x 1 84.02 84.020 128.55 0.000000000005612 ***
## Residuals 28 18.30 0.654
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## R^2 by hand from the ANOVA table: 1 - SSres / SStotal
1 - 18.30 / (84.02 + 18.30)
## [1] 0.8211493
####################################################################
## influence of outliers on slope
####################################################################
## Re-simulate the same data but plant a single outlier (y = -2) and
## compare the refit line (dashed) to the original clean fit `linmod`
## (solid, drawn via geom_abline from its coefficients).
## setting seed ensures reproducibility of random data
set.seed(1)
## get data, estimate linear model
## outlier placed at observation 4 (an extreme x value -> high leverage)
data = data.frame(x = rnorm(30, 3, 3)) %>%
mutate(y = 2+.6*x + rnorm(30, 0, 1),
y = replace(y, 4, -2))
## make a plot
ggplot(data, aes(x = x, y = y)) +
geom_point(color = "red", size = 3) +
geom_smooth(method = "lm", se = FALSE,
color = "blue", size = 1, linetype = 2) +
geom_abline(intercept = coef(linmod)[1],
slope = coef(linmod)[2], color = "blue", size = 1)

## setting seed ensures reproducibility of random data
set.seed(1)
## get data, estimate linear model
## same experiment, outlier at observation 23 (x nearer the center ->
## less leverage, smaller effect on the slope)
data = data.frame(x = rnorm(30, 3, 3)) %>%
mutate(y = 2+.6*x + rnorm(30, 0, 1),
y = replace(y, 23, -2))
## make a plot
ggplot(data, aes(x = x, y = y)) +
geom_point(color = "red", size = 3) +
geom_smooth(method = "lm", se = FALSE,
color = "blue", size = 1, linetype = 2) +
geom_abline(intercept = coef(linmod)[1],
slope = coef(linmod)[2], color = "blue", size = 1)

####################################################################
## multiple linear regression
####################################################################
## multiple linear regression
####################################################################
## Simulate weight from age and sex with parallel group effects (no
## interaction): weight = 6 + 2.5*age + 20*I(male) + noise.
set.seed(1)
data.mlr = data.frame(age = rnorm(40, 66, 6),
sex = rep(c("male", "female"), each = 20)) %>%
mutate(weight = 6 + 2.5 * age + 20 * (sex == "male") + rnorm(40, 0, 6)) %>%
as_tibble  ## FIX: tbl_df() is deprecated (dplyr >= 1.0.0); use as_tibble()
## per-group least squares lines are (nearly) parallel, as expected
ggplot(data = data.mlr, aes(y=weight, x = age, color = sex)) +
geom_point(size = 3) +
geom_smooth(method = "lm", se = FALSE) +
theme_bw()

## second plot; lines only
ggplot(data = data.mlr, aes(y=weight, x = age, color = sex)) +
geom_smooth(method = "lm", se = FALSE) +
theme_bw()

## description, modeling
summary(data.mlr)
## age sex weight
## Min. :52.71 female:20 Min. :141.3
## 1st Qu.:63.60 male :20 1st Qu.:169.2
## Median :66.77 Median :183.2
## Mean :66.55 Mean :183.1
## 3rd Qu.:70.47 3rd Qu.:198.9
## Max. :75.57 Max. :218.3
## additive model: sex enters as a dummy variable (sexmale), shifting the
## intercept by ~20, close to the true generating value
linmod = lm(weight ~ age + sex, data = data.mlr)
tidy(linmod)
## # A tibble: 3 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -9.24 11.2 -0.828 4.13e- 1
## 2 age 2.74 0.168 16.3 1.81e-18
## 3 sexmale 19.9 1.77 11.3 1.53e-13
summary(linmod)
##
## Call:
## lm(formula = weight ~ age + sex, data = data.mlr)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.2249 -3.7504 -0.4383 2.4337 12.4650
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -9.2395 11.1602 -0.828 0.413
## age 2.7403 0.1681 16.297 < 0.0000000000000002 ***
## sexmale 19.9383 1.7665 11.287 0.000000000000153 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.551 on 37 degrees of freedom
## Multiple R-squared: 0.9224, Adjusted R-squared: 0.9182
## F-statistic: 220 on 2 and 37 DF, p-value: < 0.00000000000000022
## compare raw data rows to the design matrix rows: the factor `sex`
## becomes the 0/1 indicator column `sexmale`
head(data.mlr)
## # A tibble: 6 x 3
## age sex weight
## <dbl> <fct> <dbl>
## 1 62.2 male 181.
## 2 67.1 male 192.
## 3 61.0 male 183.
## 4 75.6 male 218.
## 5 68.0 male 192.
## 6 61.1 male 174.
model.matrix(linmod) %>% head
## (Intercept) age sexmale
## 1 1 62.24128 1
## 2 1 67.10186 1
## 3 1 60.98623 1
## 4 1 75.57168 1
## 5 1 67.97705 1
## 6 1 61.07719 1
tail(data.mlr)
## # A tibble: 6 x 3
## age sex weight
## <dbl> <fct> <dbl>
## 1 57.7 female 143.
## 2 63.5 female 167.
## 3 63.6 female 162.
## 4 65.6 female 170.
## 5 72.6 female 188.
## 6 70.6 female 179.
model.matrix(linmod) %>% tail
## (Intercept) age sexmale
## 35 1 57.73764 0
## 36 1 63.51003 0
## 37 1 63.63426 0
## 38 1 65.64412 0
## 39 1 72.60015 0
## 40 1 70.57905 0
####################################################################
## interactions
####################################################################
## Simulate weight with a sex-by-age interaction: the male slope is
## 2.5 + 2 = 4.5 per year while the female slope is 2.5, so the group
## lines are no longer parallel.
set.seed(1)
data.mlr = data.frame(age = rnorm(40, 66, 6),
sex = rep(c("male", "female"), each = 20)) %>%
mutate(weight = -20 +
2.5 * age - 60 * (sex == "male") +
2 * (sex == "male") * age +
rnorm(40, 0, 6)) %>%
as_tibble  ## FIX: tbl_df() is deprecated (dplyr >= 1.0.0); use as_tibble()
ggplot(data = data.mlr, aes(y=weight, x = age, color = sex)) +
geom_point(size = 3) +
geom_smooth(method = "lm", se = FALSE) +
theme_bw()

## second plot; lines only
ggplot(data = data.mlr, aes(y=weight, x = age, color = sex)) +
geom_smooth(method = "lm", se = FALSE) +
theme_bw()

## description, modeling
summary(data.mlr)
## age sex weight
## Min. :52.71 female:20 Min. :115.3
## 1st Qu.:63.60 male :20 1st Qu.:145.0
## Median :66.77 Median :170.7
## Mean :66.55 Mean :184.2
## 3rd Qu.:70.47 3rd Qu.:224.8
## Max. :75.57 Max. :263.4
## `age * sex` expands to age + sex + age:sex; the age:sexmale estimate
## (~2) recovers the simulated slope difference between the groups
linmod = lm(weight ~ age * sex, data = data.mlr)
tidy(linmod)
## # A tibble: 4 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -33.5 16.3 -2.05 4.74e- 2
## 2 age 2.71 0.247 11.0 4.64e-13
## 3 sexmale -63.4 22.8 -2.78 8.53e- 3
## 4 age:sexmale 2.05 0.341 6.01 6.81e- 7
head(data.mlr)
## # A tibble: 6 x 3
## age sex weight
## <dbl> <fct> <dbl>
## 1 62.2 male 199.
## 2 67.1 male 220.
## 3 61.0 male 199.
## 4 75.6 male 263.
## 5 68.0 male 222.
## 6 61.1 male 191.
## the interaction column is age * sexmale: equal to age for males...
model.matrix(linmod) %>% head
## (Intercept) age sexmale age:sexmale
## 1 1 62.24128 1 62.24128
## 2 1 67.10186 1 67.10186
## 3 1 60.98623 1 60.98623
## 4 1 75.57168 1 75.57168
## 5 1 67.97705 1 67.97705
## 6 1 61.07719 1 61.07719
tail(data.mlr)
## # A tibble: 6 x 3
## age sex weight
## <dbl> <fct> <dbl>
## 1 57.7 female 117.
## 2 63.5 female 141.
## 3 63.6 female 136.
## 4 65.6 female 144.
## 5 72.6 female 162.
## 6 70.6 female 153.
## ... and zero for females
model.matrix(linmod) %>% tail
## (Intercept) age sexmale age:sexmale
## 35 1 57.73764 0 0
## 36 1 63.51003 0 0
## 37 1 63.63426 0 0
## 38 1 65.64412 0 0
## 39 1 72.60015 0 0
## 40 1 70.57905 0 0
####################################################################
## Polynomial and spline models
####################################################################
## nonlinear data
####################################################################
## Simulate a clearly nonlinear mean function (a quartic in x) to
## motivate polynomial and spline regression.
set.seed(1)
data.nonlin = data.frame(x = runif(100, 0, 1)) %>%
mutate(y = -30*(x-.5)^2 + 100*(x-.5)^4 + rnorm(100, 0, .3))
## data plot
ggplot(data.nonlin, aes(y=y, x=x)) + geom_point() + theme_bw()

####################################################################
## fit as a polynomial
####################################################################
## build the raw polynomial basis columns explicitly (x, x^2, x^3, x^4)
data.nonlin = mutate(data.nonlin,
x.pow2 = x^2, x.pow3 = x^3, x.pow4 = x^4)
## fit data with quartic, plot results
## quartic matches the true degree of the mean function, so it fits well
quartfit = lm(y ~ x + x.pow2 + x.pow3 + x.pow4, data = data.nonlin)
broom::tidy(quartfit)
## # A tibble: 5 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -1.21 0.184 -6.59 2.40e- 9
## 2 x -22.0 2.39 -9.20 8.67e-15
## 3 x.pow2 130. 9.41 13.8 1.92e-24
## 4 x.pow3 -216. 13.9 -15.6 6.71e-28
## 5 x.pow4 108. 6.84 15.9 1.93e-28
mutate(data.nonlin, fitted = fitted(quartfit)) %>%
ggplot(., aes(y=y, x=x)) +
geom_point() +
geom_line(aes(y = fitted), color = "red") +
theme_bw()

## plot polynomial basis functions
## one facet per basis column (constant, x, x^2, x^3, x^4)
data.nonlin %>%
dplyr::select(-y) %>%
mutate(pow.0 = 1, x.pow1 = x) %>%
gather(key = power, value = value, -x) %>%
mutate(power = factor(power)) %>%
ggplot(., aes(x = x, y = value)) +
geom_line() +
facet_grid(. ~ power, scales = "free") +
theme_bw()

## fit data with quadratic, plot results
## degree too low -> the fit underfits (misses the curvature)
quadfit = lm(y ~ x + x.pow2, data = data.nonlin)
mutate(data.nonlin, fitted = fitted(quadfit)) %>%
ggplot(., aes(y=y, x=x)) +
geom_point() +
geom_line(aes(y = fitted), color = "red") +
theme_bw()

## fit data with degree 20 poly, plot results
## degree too high -> wiggles; poly() uses orthogonal polynomials to
## avoid the severe collinearity raw powers would have
twentyfit = lm(y ~ poly(x, 20), data = data.nonlin)
mutate(data.nonlin, fitted = fitted(twentyfit)) %>%
ggplot(., aes(y=y, x=x)) +
geom_point() +
geom_line(aes(y = fitted), color = "red") +
theme_bw()

####################################################################
## fit as a piecewise linear
####################################################################
## Same simulated data; fit a continuous piecewise-linear model using
## truncated-line ("hinge") basis terms with knots at x = .2, .5, .8.
set.seed(1)
data.nonlin = data.frame(x = runif(100, 0, 1)) %>%
mutate(y = -30*(x-.5)^2 + 100*(x-.5)^4 + rnorm(100, 0, .3))
## each spline_k term is (x - k)_+ : zero below the knot, linear above,
## so its coefficient is the *change* in slope at that knot
data.nonlin = mutate(data.nonlin,
spline_2 = (x - .2) * (x >= .2),
spline_5 = (x - .5) * (x >= .5),
spline_8 = (x - .8) * (x >= .8))
piecewise.fit = lm(y ~ x + spline_2 + spline_5 + spline_8, data = data.nonlin)
tidy(piecewise.fit)
## # A tibble: 5 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -2.06 0.164 -12.6 6.57e-22
## 2 x 0.285 1.05 0.272 7.86e- 1
## 3 spline_2 7.86 1.36 5.76 1.04e- 7
## 4 spline_5 -16.2 0.783 -20.6 6.78e-37
## 5 spline_8 7.95 1.29 6.18 1.56e- 8
mutate(data.nonlin, fitted = fitted(piecewise.fit)) %>%
ggplot(., aes(y=y, x=x)) + geom_point() +
geom_line(aes(y = fitted), color = "red") + theme_bw()

## plot piecewise basis functions
## one facet per basis term (intercept, x, and the three hinges)
data.nonlin %>%
dplyr::select(-y) %>%
mutate(spline_int = 1, spline_0 = x) %>%
gather(key = power, value = value, -x) %>%
mutate(power = factor(power), power = relevel(power, ref = "spline_int")) %>%
ggplot(., aes(x = x, y = value)) + geom_line() +
facet_grid(. ~ power, scales = "free") +
theme_bw()

####################################################################
## fit as a bspline
####################################################################
## NOTE(review): despite the section title, this uses splines::ns(),
## which builds a *natural cubic spline* basis (linear beyond the
## boundary knots), not splines::bs(). With df = 5 the knots are chosen
## automatically from quantiles of x.
set.seed(1)
data.nonlin = data.frame(x = runif(100, 0, 1)) %>%
mutate(y = -30*(x-.5)^2 + 100*(x-.5)^4 + rnorm(100, 0, .3))
## bind the 5 basis columns onto the data and give them readable names
data.nonlin = data.nonlin %>% bind_cols(., data.frame(ns(.[['x']], df = 5))) %>%
dplyr::rename(sp.1 = X1, sp.2 = X2, sp.3 = X3, sp.4 = X4, sp.5 = X5)
bspline.fit = lm(y ~ sp.1 + sp.2 + sp.3 + sp.4 + sp.5, data = data.nonlin)
tidy(bspline.fit)
## # A tibble: 6 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -1.95 0.142 -13.7 3.15e-24
## 2 sp.1 2.52 0.163 15.4 1.57e-27
## 3 sp.2 1.91 0.221 8.64 1.40e-13
## 4 sp.3 -0.365 0.158 -2.32 2.25e- 2
## 5 sp.4 -0.415 0.353 -1.17 2.44e- 1
## 6 sp.5 0.432 0.174 2.48 1.50e- 2
mutate(data.nonlin, fitted = fitted(bspline.fit)) %>%
ggplot(., aes(y=y, x=x)) + geom_point() +
geom_line(aes(y = fitted), color = "red") + theme_bw()

## plot bspline basis functions
## one facet per spline basis column (plus the constant sp.0)
data.nonlin %>%
dplyr::select(-y) %>%
mutate(sp.0 = 1) %>%
gather(key = power, value = value, -x) %>%
ggplot(., aes(x = x, y = value)) + geom_line() +
facet_grid(. ~ power, scales = "free") +
theme_bw()

####################################################################
## Identifiability and collinearity
####################################################################
## load MLB data
####################################################################
## download data file
#download.file("http://www.openintro.org/stat/data/mlb11.RData", destfile = "mlb11.RData")
## requires mlb11.RData in the working directory (see download.file above)
load("mlb11.RData")
## re-load the heights data (restores its original row order for the
## later heights analyses; it was re-arranged earlier in this script)
data(heights)
## FIX: tbl_df() is deprecated (dplyr >= 1.0.0); use as_tibble()
mlb11<- mlb11 %>% as_tibble
mlb11
## # A tibble: 30 x 12
## team runs at_bats hits homeruns bat_avg strikeouts stolen_bases wins
## <fct> <int> <int> <int> <int> <dbl> <int> <int> <int>
## 1 Texa… 855 5659 1599 210 0.283 930 143 96
## 2 Bost… 875 5710 1600 203 0.28 1108 102 90
## 3 Detr… 787 5563 1540 169 0.277 1143 49 95
## 4 Kans… 730 5672 1560 129 0.275 1006 153 71
## 5 St. … 762 5532 1513 162 0.273 978 57 90
## 6 New … 718 5600 1477 108 0.264 1085 130 77
## 7 New … 867 5518 1452 222 0.263 1138 147 97
## 8 Milw… 721 5447 1422 185 0.261 1083 94 96
## 9 Colo… 735 5544 1429 163 0.258 1201 118 73
## 10 Hous… 615 5598 1442 95 0.258 1164 118 56
## # ... with 20 more rows, and 3 more variables: new_onbase <dbl>,
## # new_slug <dbl>, new_obs <dbl>
####################################################################
## make some plots
####################################################################
p1 = ggplot(mlb11, aes(x = runs, y = hits)) + geom_point() +
labs(x = "Runs", y = "Hits") + theme_bw()
p2 = ggplot(mlb11, aes(x = homeruns, y = hits)) + geom_point() +
labs(x = "Homeruns", y = "Hits") + theme_bw()
p3 = ggplot(mlb11, aes(x = stolen_bases, y = hits)) + geom_point() +
labs(x = "Stolen Bases", y = "Hits") + theme_bw()
grid.arrange(p1, p2, p3, ncol = 3)

####################################################################
## fit a model
####################################################################
linmod = lm(runs ~ at_bats + hits + homeruns + stolen_bases, data = mlb11)
tidy(linmod)
## # A tibble: 5 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 581. 526. 1.10 0.280
## 2 at_bats -0.202 0.117 -1.72 0.0971
## 3 hits 0.697 0.113 6.16 0.00000191
## 4 homeruns 1.25 0.159 7.87 0.0000000318
## 5 stolen_bases 0.523 0.169 3.10 0.00473
####################################################################
## fit a model "by hand"
####################################################################
## build the design matrix (leading 1s = intercept column) and apply the
## closed-form OLS solution betaHat = (X'X)^{-1} X'y
X = cbind(1, mlb11$at_bats, mlb11$hits, mlb11$homeruns, mlb11$stolen_bases)
y = (mlb11$runs)
betaHat = solve(t(X) %*% X) %*% t(X) %*% y
betaHat
## [,1]
## [1,] 581.2109940
## [2,] -0.2023278
## [3,] 0.6974143
## [4,] 1.2535062
## [5,] 0.5229741
fitted = X %*% betaHat
## residual SD estimate; denominator is n - p - 1 = 30 - 4 - 1 = 25 df
sigmaHat = sqrt(t(y - fitted) %*% (y - fitted) / (30-4-1))
sigmaHat
## [,1]
## [1,] 26.84777
## covariance matrix of betaHat: sigma^2 (X'X)^{-1}
VarBeta = as.numeric(sigmaHat^2) * (solve(t(X) %*% X))
VarBeta
## [,1] [,2] [,3] [,4]
## [1,] 277103.547951 -60.92179961738 42.6907437568 -1.1735722100
## [2,] -60.921800 0.01377374118 -0.0108596986 0.0008804808
## [3,] 42.690744 -0.01085969863 0.0128013011 -0.0054905039
## [4,] -1.173572 0.00088048084 -0.0054905039 0.0253823891
## [5,] -5.003475 0.00008390983 0.0008247119 0.0017790736
## [,5]
## [1,] -5.00347547078
## [2,] 0.00008390983
## [3,] 0.00082471190
## [4,] 0.00177907356
## [5,] 0.02843657986
## square roots of the diagonal = the standard errors reported by lm()
sqrt(diag(VarBeta))
## [1] 526.4062575 0.1173616 0.1131428 0.1593185 0.1686315
####################################################################
## regression analyses
####################################################################
## Baseline heights fit, then refit with the predictor rescaled to
## meters: the slope rescales by 1/.0254 but the intercept, t statistics
## and p-values are unchanged (rescaling a predictor changes units, not
## the fit).
linmod = lm(Dheight~Mheight, data = heights)
tidy(linmod)
## # A tibble: 2 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 29.9 1.62 18.4 5.21e-68
## 2 Mheight 0.542 0.0260 20.9 3.22e-84
## plot daughter height against mother height
ggplot(heights, aes(x = Mheight, y = Dheight)) + geom_point(color = "red") +
labs(x = "Mother Height", y = "Daughter Height") +
geom_smooth(method = "lm", se = FALSE, size = 2) +
theme_bw()

####################################################################
## mother's height in meters rather than inches
####################################################################
## 1 inch = .0254 meters
heights = mutate(heights, Mheight_m = Mheight * .0254)
linmod = lm(Dheight~Mheight_m, data = heights)
tidy(linmod)
## # A tibble: 2 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 29.9 1.62 18.4 5.21e-68
## 2 Mheight_m 21.3 1.02 20.9 3.22e-84
## plot daughter height against mother height
ggplot(heights, aes(x = Mheight_m, y = Dheight)) + geom_point(color = "red") +
labs(x = "Mother Height", y = "Daughter Height") +
geom_smooth(method = "lm", se = FALSE, size = 2) +
theme_bw()

####################################################################
## effects of collinearity
####################################################################
## Mheight_m is an exact linear function of Mheight, so including both
## makes the model unidentifiable: lm() drops one coefficient (NA) and
## X'X is singular (solve() would error -- hence the commented-out call).
linmod.col = lm(Dheight ~ Mheight + Mheight_m, data = heights)
summary(linmod.col)
##
## Call:
## lm(formula = Dheight ~ Mheight + Mheight_m, data = heights)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.397 -1.529 0.036 1.492 9.053
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 29.91744 1.62247 18.44 <0.0000000000000002 ***
## Mheight 0.54175 0.02596 20.87 <0.0000000000000002 ***
## Mheight_m NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.266 on 1373 degrees of freedom
## Multiple R-squared: 0.2408, Adjusted R-squared: 0.2402
## F-statistic: 435.5 on 1 and 1373 DF, p-value: < 0.00000000000000022
X = as.matrix(cbind(1, dplyr::select(heights, Mheight, Mheight_m)))
#solve(t(X) %*% X)
## some measurement error on the meters
## adding tiny noise breaks the exact collinearity, so both coefficients
## become estimable -- but with hugely inflated standard errors.
## NOTE(review): this rnorm() call is not preceded by set.seed(), so the
## numbers below will not reproduce exactly on a rerun.
heights = mutate(heights, Mheight_m = Mheight_m + rnorm(1375, mean = 0, sd = .01))
summarize(heights, cor = cor(Mheight, Mheight_m))
## cor
## 1 0.9851668
X = as.matrix(cbind(1, dplyr::select(heights, Mheight, Mheight_m)))
## (X'X)^{-1} now exists, but its entries are large -> large variances
solve(t(X) %*% X)
## 1 Mheight Mheight_m
## 1 0.512689796 -0.007347167 -0.03348367
## Mheight -0.007347167 0.004456172 -0.17082049
## Mheight_m -0.033483667 -0.170820488 6.74680836
ggplot(heights, aes(x = Mheight, y=Mheight_m)) + geom_point() +
labs(x = "Height in inches", y = "Height in meters") +
theme_bw()

## with near-collinear predictors, neither coefficient is estimated
## precisely (compare the standard errors to the single-predictor fits)
linmod.me = lm(Dheight ~ Mheight + Mheight_m, data = heights)
tidy(linmod.me)
## # A tibble: 3 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 29.9 1.62 18.4 6.89e-68
## 2 Mheight 0.453 0.151 2.99 2.80e- 3
## 3 Mheight_m 3.50 5.89 0.595 5.52e- 1
####################################################################
## Inference for MLR
####################################################################
## fit a model
####################################################################
linmod = lm(runs ~ at_bats + hits + homeruns + stolen_bases, data = mlb11)
summary(linmod)
##
## Call:
## lm(formula = runs ~ at_bats + hits + homeruns + stolen_bases,
## data = mlb11)
##
## Residuals:
## Min 1Q Median 3Q Max
## -35.868 -22.828 5.927 19.801 36.379
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 581.2110 526.4063 1.104 0.28006
## at_bats -0.2023 0.1174 -1.724 0.09706 .
## hits 0.6974 0.1131 6.164 0.0000019111 ***
## homeruns 1.2535 0.1593 7.868 0.0000000318 ***
## stolen_bases 0.5230 0.1686 3.101 0.00473 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 26.85 on 25 degrees of freedom
## Multiple R-squared: 0.9087, Adjusted R-squared: 0.894
## F-statistic: 62.17 on 4 and 25 DF, p-value: 0.00000000000126
## compare to a null model with two predictors
linmod.null1 = lm(runs ~ hits + homeruns, data = mlb11)
anova(linmod.null1, linmod)
## Analysis of Variance Table
##
## Model 1: runs ~ hits + homeruns
## Model 2: runs ~ at_bats + hits + homeruns + stolen_bases
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 27 27128
## 2 25 18020 2 9107.8 6.3178 0.006015 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## compare to a null model with everything but stolen_bases
linmod.null2 = lm(runs ~ at_bats + hits + homeruns, data = mlb11)
anova(linmod.null2, linmod)
## Analysis of Variance Table
##
## Model 1: runs ~ at_bats + hits + homeruns
## Model 2: runs ~ at_bats + hits + homeruns + stolen_bases
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 26 24953
## 2 25 18020 1 6932.7 9.618 0.004728 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## compare to a null model with only the intercept
linmod.null3 = lm(runs ~ 1, data = mlb11)
anova(linmod.null3, linmod)
## Analysis of Variance Table
##
## Model 1: runs ~ 1
## Model 2: runs ~ at_bats + hits + homeruns + stolen_bases
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 29 197281
## 2 25 18020 4 179261 62.174 0.00000000000126 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
####################################################################
## simulations looking at LSEs
####################################################################
## sampling distribution of the slope LSE under normal and non-normal
## errors: nrep datasets at each sample size, with the design x held fixed
## within each sample size
## fix simulation elements
nrep = 10000
beta0 = 0
beta1 = 1
n = c(10, 100, 1000)
## matrices to hold results
NormLSEs = NonNormLSEs = data.frame(n10 = rep(NA, nrep),
n100 = rep(NA, nrep),
n1000 = rep(NA, nrep))
## simulate data; compute and save LSEs (normal errors)
for(samp.size in 1:3){
x = rnorm(n[samp.size], 0, 1)
for(i in 1:nrep){
y = beta0 + beta1 * x + rnorm(n[samp.size], 0, 1)
NormLSEs[i,samp.size] = lm(y~x)$coef[2]
}
}
## non-normal errors: 10/3 * Bernoulli(.1) - 1/3 is highly skewed but
## still has mean zero
for(samp.size in 1:3){
x = rnorm(n[samp.size], 0, 1)
for(i in 1:nrep){
y = beta0 + beta1 * x + (10/3 * (rbinom(n[samp.size], 1, .1)) - 1/3)
NonNormLSEs[i,samp.size] = lm(y~x)$coef[2]
}
}
## arrange and plot results of simulation for normal errors;
## Normalized = sqrt(n) * (betahat - beta1) puts all sample sizes on a
## common scale so the limiting distribution is visible
NormLSEs = gather(NormLSEs, key = SampSize, value = LSE) %>%
mutate(Normalized = sqrt(rep(c(10,100,1000), each = nrep)) * (LSE - 1))
p1 = ggplot(NormLSEs, aes(LSE, fill = SampSize)) +
geom_density(alpha = .2) +
labs(title = "LSEs") +
theme_bw()
p2 = ggplot(NormLSEs, aes(Normalized, fill = SampSize)) +
geom_density(alpha = .2) +
labs(title = "Normalized LSEs") +
theme_bw()
grid.arrange(p1, p2, ncol = 2)

## arrange and plot results of simulation for non-normal errors
## (the normalized LSEs still look approximately normal for large n even
## though the errors are not normal)
NonNormLSEs = gather(NonNormLSEs, key = SampSize, value = LSE) %>%
mutate(Normalized = sqrt(rep(c(10,100,1000), each = nrep)) * (LSE - 1))
p1 = ggplot(NonNormLSEs, aes(LSE, fill = SampSize)) +
geom_density(alpha = .2) +
labs(title = "LSEs") +
theme_bw()
p2 = ggplot(NonNormLSEs, aes(Normalized, fill = SampSize)) +
geom_density(alpha = .2) +
labs(title = "Normalized LSEs") +
theme_bw()
grid.arrange(p1, p2, ncol = 2)

####################################################################
## plot of FWER
####################################################################
## family-wise error rate for k independent tests at level alpha:
## P(at least one false positive) = 1 - (1 - alpha)^k
alpha <- 0.05
k <- 1:100
fwer <- 1 - (1 - alpha)^k
plot.dat <- data.frame(k = k, fwer = fwer)
## FWER climbs toward the horizontal reference line at 1 as k grows
ggplot(plot.dat, aes(x = k, y = fwer)) +
geom_path() +
geom_hline(yintercept = 1)

####################################################################
## nonlinear example
####################################################################
set.seed(1)
## true mean function is a quartic in x, with small noise
data.nonlin = data.frame(x = runif(100, 0, 1)) %>%
mutate(y = -30*(x-.5)^2 + 100*(x-.5)^4 + rnorm(100, 0, .3))
## data plot
ggplot(data.nonlin, aes(y=y, x=x)) + geom_point() + theme_bw()

## truncated-line (piecewise linear) spline basis: one column (x - u)_+
## for each knot u in {.1, .15, ..., .9}
PW.basis = sapply(seq(.1, .9, by = 0.05),
function(u) (data.nonlin$x - u) * (data.nonlin$x >= u))
## substr() strips the "0." prefix, giving names spline_1, spline_15, ...
colnames(PW.basis) = paste0("spline_", substr(as.character(seq(.1, .9, by = 0.05)), 3, 5))
PW.basis = as.data.frame(PW.basis)
data.nonlin = bind_cols(data.nonlin, PW.basis)
## fit three models of increasing flexibility: a straight line (underfit),
## three knots (about right for the quartic), and all seventeen knots
## (overfit)
piecewise.underfit = lm(y ~ x, data = data.nonlin)
tidy(piecewise.underfit)
## # A tibble: 2 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -1.08 0.189 -5.68 0.000000139
## 2 x -0.251 0.325 -0.772 0.442
piecewise.fit = lm(y ~ x + spline_15 + spline_5 + spline_9, data = data.nonlin)
tidy(piecewise.fit)
## # A tibble: 5 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -1.78 0.180 -9.91 2.61e-16
## 2 x -3.51 1.49 -2.36 2.06e- 2
## 3 spline_15 11.1 1.72 6.46 4.39e- 9
## 4 spline_5 -14.8 0.584 -25.3 5.95e-44
## 5 spline_9 20.6 2.76 7.45 4.24e-11
## overfit: every knot enters, and the individual knot terms become noisy
piecewise.overfit = lm(y ~ x + spline_1 + spline_15 + spline_2 +
spline_25 + spline_3 + spline_35 + spline_4 +
spline_45 + spline_5 + spline_55 + spline_6 +
spline_65 + spline_7 + spline_75 + spline_8 +
spline_85 + spline_9, data = data.nonlin)
tidy(piecewise.overfit)
## # A tibble: 19 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -1.78 0.241 -7.37 1.29e-10
## 2 x -3.36 3.43 -0.981 3.29e- 1
## 3 spline_1 -5.25 9.17 -0.572 5.69e- 1
## 4 spline_15 23.0 12.5 1.84 6.87e- 2
## 5 spline_2 -12.1 9.51 -1.28 2.06e- 1
## 6 spline_25 10.6 9.73 1.09 2.77e- 1
## 7 spline_3 -7.99 12.8 -0.626 5.33e- 1
## 8 spline_35 8.50 10.9 0.776 4.40e- 1
## 9 spline_4 -9.39 8.38 -1.12 2.66e- 1
## 10 spline_45 -4.32 9.84 -0.439 6.62e- 1
## 11 spline_5 6.68 10.7 0.621 5.36e- 1
## 12 spline_55 -24.5 11.8 -2.07 4.20e- 2
## 13 spline_6 17.7 10.6 1.67 9.86e- 2
## 14 spline_65 -12.3 8.33 -1.48 1.43e- 1
## 15 spline_7 5.57 9.93 0.561 5.76e- 1
## 16 spline_75 1.12 11.4 0.0977 9.22e- 1
## 17 spline_8 0.944 10.2 0.0928 9.26e- 1
## 18 spline_85 0.843 9.91 0.0851 9.32e- 1
## 19 spline_9 14.9 7.83 1.90 6.04e- 2
## add fitted values to data (for plotting the three curves below)
data.nonlin = mutate(data.nonlin,
underfit = fitted(piecewise.underfit),
fit = fitted(piecewise.fit),
overfit = fitted(piecewise.overfit))
## f test comparing to null linear fit to alternative quartic fit;
## the huge F says the spline terms are clearly needed
anova(piecewise.underfit, piecewise.fit)
## Analysis of Variance Table
##
## Model 1: y ~ x
## Model 2: y ~ x + spline_15 + spline_5 + spline_9
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 98 73.444
## 2 95 8.240 3 65.205 250.6 < 0.00000000000000022 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## NOTE(review): both fitted curves are drawn in red; presumably one was
## meant to be a different color to distinguish underfit from fit
ggplot(data.nonlin, aes(y=y, x=x)) + geom_point() +
geom_line(aes(y = underfit), color = "red") +
geom_line(aes(y = fit), color = "red") +
theme_bw()

## f test comparing to null quartic fit to alternative twenty fit;
## the extra fourteen knot terms do not significantly improve the fit
anova(piecewise.fit, piecewise.overfit)
## Analysis of Variance Table
##
## Model 1: y ~ x + spline_15 + spline_5 + spline_9
## Model 2: y ~ x + spline_1 + spline_15 + spline_2 + spline_25 + spline_3 +
## spline_35 + spline_4 + spline_45 + spline_5 + spline_55 +
## spline_6 + spline_65 + spline_7 + spline_75 + spline_8 +
## spline_85 + spline_9
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 95 8.2395
## 2 81 6.7862 14 1.4533 1.239 0.2645
## NOTE(review): both fitted curves are drawn in red; presumably one was
## meant to be a different color to distinguish fit from overfit
ggplot(data.nonlin, aes(y=y, x=x)) + geom_point() +
geom_line(aes(y = fit), color = "red") +
geom_line(aes(y = overfit), color = "red") +
theme_bw()

## anova table for all three (sequential F tests): a big improvement from
## linear to the 3-knot model, none from 3 knots to all 17
anova(piecewise.underfit, piecewise.fit, piecewise.overfit)
## Analysis of Variance Table
##
## Model 1: y ~ x
## Model 2: y ~ x + spline_15 + spline_5 + spline_9
## Model 3: y ~ x + spline_1 + spline_15 + spline_2 + spline_25 + spline_3 +
## spline_35 + spline_4 + spline_45 + spline_5 + spline_55 +
## spline_6 + spline_65 + spline_7 + spline_75 + spline_8 +
## spline_85 + spline_9
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 98 73.444
## 2 95 8.240 3 65.205 259.427 <0.0000000000000002 ***
## 3 81 6.786 14 1.453 1.239 0.2645
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
####################################################################
## mlb data example of LRTs
####################################################################
## expects mlb11.RData in the working directory
load("mlb11.RData")
head(mlb11)
## team runs at_bats hits homeruns bat_avg strikeouts
## 1 Texas Rangers 855 5659 1599 210 0.283 930
## 2 Boston Red Sox 875 5710 1600 203 0.280 1108
## 3 Detroit Tigers 787 5563 1540 169 0.277 1143
## 4 Kansas City Royals 730 5672 1560 129 0.275 1006
## 5 St. Louis Cardinals 762 5532 1513 162 0.273 978
## 6 New York Mets 718 5600 1477 108 0.264 1085
## stolen_bases wins new_onbase new_slug new_obs
## 1 143 96 0.340 0.460 0.800
## 2 102 90 0.349 0.461 0.810
## 3 49 95 0.340 0.434 0.773
## 4 153 71 0.329 0.415 0.744
## 5 57 90 0.341 0.425 0.766
## 6 130 77 0.335 0.391 0.725
## fit the full model
linmod = lm(runs ~ at_bats + hits + homeruns + stolen_bases, data = mlb11)
## compare to a null model with two predictors
linmod.null1 = lm(runs ~ hits + homeruns, data = mlb11)
anova(linmod.null1, linmod)
## Analysis of Variance Table
##
## Model 1: runs ~ hits + homeruns
## Model 2: runs ~ at_bats + hits + homeruns + stolen_bases
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 27 27128
## 2 25 18020 2 9107.8 6.3178 0.006015 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## LRT: Delta = -2 * (loglik_null - loglik_full) is asymptotically
## chi-squared with df = number of constrained coefficients (2 here)
Delta = -2*(logLik(linmod.null1) - logLik(linmod))
1-pchisq(Delta, 2)
## 'log Lik.' 0.002163305 (df=4)
####################################################################
## illustration of confidence ellipses
####################################################################
library(ellipse)
##
## Attaching package: 'ellipse'
## The following object is masked from 'package:car':
##
## ellipse
## The following object is masked from 'package:graphics':
##
## pairs
## 95% joint confidence region for (at_bats, hits) from the mlb11 model
## fit above; coefficients 2 and 3 of linmod
CI.ellipse = as.data.frame(ellipse(linmod,c(2,3)))
est = as.data.frame(t(as.matrix(coef(linmod)[2:3])))
## plot the joint confidence region; the dashed lines are the marginal 95%
## CIs (confint() returns both endpoints, so each call draws two lines)
ggplot(CI.ellipse, aes(x = at_bats, y = hits)) + geom_path() +
geom_hline(yintercept = confint(linmod)[3,], linetype = 2) +
geom_vline(xintercept = confint(linmod)[2,], linetype = 2) +
geom_point(data = est, size = 4)

####################################################################
## polynomial example of pointwise intervals
####################################################################
## generate data (same quartic mean function as the nonlinear example)
set.seed(1)
x = runif(100, 0, 1)
x = sort(x)
y = -30*(x-.5)^2 + 100*(x-.5)^4 + rnorm(100, 0, .3)
plot.dat = as.data.frame(cbind(y,x))
## fit data with quartic (orthogonal polynomial basis via poly())
quartfit = lm(y~poly(x,4))
plot.dat$quart_fit = quartfit$fitted.values
## plot of fit; note this object shadows graphics::plot for the rest of
## the script
plot = ggplot(plot.dat, aes(y=y, x=x)) + geom_point(size = 2) +
geom_line(aes(y=quart_fit, x = x), color = "red", size = 1)
plot

## use quartfit to find the variance matrix of the estimated coefficients:
## Var(beta.hat) = sigma^2 * (X'X)^{-1}
VarMat = (summary(quartfit)$sigma)^2 * summary(quartfit)$cov.unscaled
x = sort(x) ## already sorted above; harmless
x.poly = cbind(1, poly(x, 4))
## ci at one new x (the 50th smallest): se of the fitted value is
## sqrt(x0' Var(beta.hat) x0). drop() collapses the 1x1 matrix products to
## scalars, which removes the deprecated matrix-vector recycling that the
## original version warned about
fitted = drop(x.poly[50,] %*% quartfit$coef)
se = drop(sqrt(x.poly[50,] %*% VarMat %*% x.poly[50,]))
ybound = fitted + c(-2, 2) * se
plot + geom_segment(aes(x = x[50], y = ybound[1], xend = x[50], yend = ybound[2]), color = "blue", size = 1.1)

## ci's at all new x's: pointwise +/- 2 se band around the fitted curve
fitted.overall = x.poly %*% quartfit$coef
se.fitted = sqrt(diag(x.poly %*% VarMat %*% t(x.poly)))
plot.dat$UB = fitted.overall + 2 * se.fitted
plot.dat$LB = fitted.overall - 2 * se.fitted
plot + geom_segment(aes(x = x[50], y = ybound[1], xend = x[50], yend = ybound[2]), color = "blue", size = 1.1) +
geom_line(data = plot.dat, aes(y=UB, x = x), color = "blue", linetype = 2) +
geom_line(data = plot.dat, aes(y=LB, x = x), color = "blue", linetype = 2)

####################################################################
## mother/daughter height to show difference in CI and PI
####################################################################
library(alr3)
data(heights)
x = heights$Mheight
y = heights$Dheight
linmod = lm(y~x)
## get pointwise CIs
x.design = cbind(1, x)
## BUG FIX: Var(beta.hat) = sigma^2 * (X'X)^{-1}; the original multiplied
## cov.unscaled by sigma (unsquared), making the confidence band the wrong
## width (compare the quartic example earlier, which squares sigma)
VarMat = summary(linmod)$sigma^2 * summary(linmod)$cov.unscaled
fitted = x.design %*% linmod$coef
se.fitted = sqrt(diag(x.design %*% VarMat %*% t(x.design)))
UB = fitted + 2 * se.fitted
LB = fitted - 2 * se.fitted
## name the columns explicitly so aes(y = fitted) etc. resolve inside the
## data frame (cbind() with a matrix column would leave it unnamed)
plot.dat = data.frame(x = x, y = y, fitted = as.vector(fitted),
UB = as.vector(UB), LB = as.vector(LB))
plot = ggplot(plot.dat, aes(x = x, y = y)) + geom_point(color = "red") +
geom_line(aes(x = x, y = fitted), color = "blue") +
geom_line(aes(x = x, y = UB), color = "blue", linetype = 2) +
geom_line(aes(x = x, y = LB), color = "blue", linetype = 2)
## plot prediction interval for one subject: the PI adds the residual
## variance (the leading 1) to the variance of the fitted value. drop()
## collapses the 1x1 matrix and avoids the deprecated recycling warning
se.pred = drop(sqrt(summary(linmod)$sigma^2 * (1 + x.design[200,] %*% summary(linmod)$cov.unscaled %*% x.design[200,])))
ybound = fitted[200] + c(-2, 2) * se.pred
plot + geom_segment(aes(x = x[200], y = ybound[1],
xend = x[200], yend = ybound[2]),
color = "blue", size = 1.1)

####################################################################
## Resampling methods
####################################################################
## intuition for bootstrapping
####################################################################
n = 200
set.seed(5)
## generate a single sample; the error sd grows with x, so the usual
## constant-variance lm standard errors are suspect
data.noncst = data.frame(x = rexp(n, rate = .1)) %>%
mutate(y = 2 + 3*x + x * rnorm(n))
ggplot(data.noncst, aes(x=x, y=y)) + geom_point() +
geom_smooth(method = "lm", fill = "lightblue") + theme_bw()

## generate many samples from the true distribution: each panel is a fresh
## dataset with its own fitted line, showing fit-to-fit variability
p <- list()
for(i in 1:16){
data.noncst.cur = data.frame(x = rexp(n, rate = .1)) %>%
mutate(y = 2 + 3*x + x * rnorm(n))
p[[i]] <- ggplot(data.noncst.cur, aes(x=x, y=y)) + geom_point() +
geom_smooth(method = "lm", fill = "lightblue") + theme_bw()
}
do.call(grid.arrange, p)

## generate many samples from the true distribution: repeated sampling
## traces out the true sampling distribution of the LSEs
beta.hat = data.frame(
b0 = rep(NA, 1000),
b1 = rep(NA, 1000))
for(i in 1:1000){
data.noncst.cur = data.frame(x = rexp(n, rate = .1)) %>%
mutate(y = 2 + 3*x + x * rnorm(n))
beta.hat[i,] = coef(lm(y~x, data = data.noncst.cur))
}
ggplot(data.noncst, aes(x=x, y=y)) + geom_point() +
geom_abline(data = beta.hat, aes(intercept = b0, slope = b1), alpha = .1) +
geom_smooth(method = "lm", fill = "lightblue") + theme_bw()

## implement the bootstrap on the original dataset: resample rows of the
## single observed sample with replacement and refit; compare the spread
## of lines to the true sampling distribution above
beta.hat = data.frame(
b0 = rep(NA, 1000),
b1 = rep(NA, 1000))
for(i in 1:1000){
data.cur = sample_frac(data.noncst, size = 1, replace = TRUE)
beta.hat[i,] = coef(lm(y~x, data.cur))
}
ggplot(data.noncst, aes(x=x, y=y)) + geom_point() +
geom_abline(data = beta.hat, aes(intercept = b0, slope = b1), alpha = .1) +
geom_smooth(method = "lm", fill = "lightblue") + theme_bw()

####################################################################
## prestige example for the bootstrap
####################################################################
library(car)
## fit the model
linmod = lm(prestige ~ income, data = Prestige)
## plot data and regression fit
ggplot(Prestige, aes(x = income, y = prestige)) + geom_point() +
geom_smooth(method = "lm") + theme_bw()

## define a vector for the bootstrapped estimates (preallocated)
betaHatBS = data.frame(b1.est = rep(NA, 10000))
## use a loop to do the bootstrap: resample rows with replacement, refit,
## keep the slope
for(i in 1:10000){
data.cur = sample_frac(Prestige, size = 1, replace = TRUE)
betaHatBS$b1.est[i] = lm(prestige ~ income, data = data.cur)$coef[2]
}
## normal reference distribution implied by the usual lm standard error
betaHatBS$b1.null = rnorm(10000, linmod$coef[2], summary(linmod)$coef[2,2])
## make a plot of bootstrap and normal-based distributions
ggplot(betaHatBS, aes(x = b1.est)) + geom_density(fill = "blue", alpha = .3) +
geom_density(aes(x = b1.null), fill = "red", alpha = .3) + theme_bw()

####################################################################
## permutation testing
####################################################################
n = 20
set.seed(1)
## generate a single sample (true slope = 1, non-constant variance)
data.noncst = data.frame(x = rexp(n, rate = .1)) %>%
mutate(y = 2 + 1 * x + x * rnorm(n))
ggplot(data.noncst, aes(x=x, y=y)) + geom_point() +
geom_smooth(method = "lm", fill = "lightblue") + theme_bw()

## generate a few permuted datasets: shuffling x breaks any x-y
## association, so each panel shows what "no association" looks like
## while keeping both marginal distributions fixed
p <- list()
for(i in 1:16){
data.noncst.cur = mutate(data.noncst, x = sample(x, length(x), replace = FALSE))
p[[i]] <- ggplot(data.noncst.cur, aes(x=x, y=y)) + geom_point() +
geom_smooth(method = "lm", fill = "lightblue") + theme_bw()
}
do.call(grid.arrange, p)

## do enough permutations to test
## observed slope from the original (unpermuted) data
obs.coef = coef(lm(y ~ x, data = data.noncst))[2]
b1 = data.frame(b1.null = rep(NA, 10000))
## each permutation of x yields one draw from the null distribution of the
## slope estimate
for(i in 1:10000){
data.noncst.cur = mutate(data.noncst, x = sample(x, length(x), replace = FALSE))
b1$b1.null[i] = coef(lm(y ~ x, data = data.noncst.cur))[2]
}
ggplot(b1, aes(x = b1.null)) + geom_density(fill = "blue", alpha = .3) +
geom_vline(xintercept = obs.coef, col = "red") + theme_bw()

## two-sided permutation p-value.
## BUG FIX: the original 2 * mean(b1 > obs.coef) is only valid when the
## observed statistic falls in the upper tail (and can exceed 1 otherwise);
## doubling the smaller tail handles both directions. Here the observed
## slope is in the upper tail, so the value matches the recorded output.
2 * min(mean(b1$b1.null >= obs.coef), mean(b1$b1.null <= obs.coef))
## [1] 0.0074
####################################################################
## cross validation
####################################################################
## generate data (same quartic mean function and spline basis as before)
set.seed(1)
data.nonlin = data.frame(x = runif(100, 0, 1)) %>%
mutate(y = -30*(x-.5)^2 + 100*(x-.5)^4 + rnorm(100, 0, .3))
## code for generating an overly-rich piecewise spline basis
PW.basis = sapply(seq(.1, .9, by = 0.05), function(u) (data.nonlin$x - u) * (data.nonlin$x >= u))
colnames(PW.basis) = paste0("spline_", substr(as.character(seq(.1, .9, by = 0.05)), 3, 5))
PW.basis = as.data.frame(PW.basis)
data.nonlin = bind_cols(data.nonlin, PW.basis)
## one column of held-out MSEs per candidate model (increasing flexibility)
MSEs = data.frame(
model1 = rep(NA, 100),
model2 = rep(NA, 100),
model3 = rep(NA, 100),
model4 = rep(NA, 100),
model5 = rep(NA, 100)
)
## 100 replicates of a random 80/20 train/test split; fit each of the five
## models on the training rows and score on the held-out rows
for(i in 1:100){
set.seed(i)
data.nonlin = mutate(data.nonlin,
cv_group = sample(1:100, 100, replace = FALSE) <= 80,
cv_group = factor(cv_group, levels = c(TRUE, FALSE),
labels = c("train", "test")))
data.train = filter(data.nonlin, cv_group == "train")
data.test = filter(data.nonlin, cv_group == "test")
fit.1 = lm(y ~ x, data = data.train)
MSEs[i,1] = mean((data.test$y - predict(fit.1, newdata = data.test))^2)
fit.2 = lm(y ~ x + spline_5, data = data.train)
MSEs[i,2] = mean((data.test$y - predict(fit.2, newdata = data.test))^2)
fit.3 = lm(y ~ x + spline_2 + spline_5 + spline_8, data = data.train)
MSEs[i,3] = mean((data.test$y - predict(fit.3, newdata = data.test))^2)
fit.4 = lm(y ~ x + spline_15 + spline_5 + spline_9, data = data.train)
MSEs[i,4] = mean((data.test$y - predict(fit.4, newdata = data.test))^2)
fit.5 = lm(y ~ x + spline_1 + spline_15 + spline_2 + spline_25 + spline_3 + spline_35 +
spline_4 + spline_45 + spline_5 + spline_55 + spline_6 + spline_65 +
spline_7 + spline_75 + spline_8 + spline_85 + spline_9,
data = data.train)
MSEs[i,5] = mean((data.test$y - predict(fit.5, newdata = data.test))^2)
}
## held-out MSEs from the first split only
slice(MSEs, 1) %>% gather(key = fit, value = MSE, model1:model5) %>%
ggplot(aes(x = fit, y = MSE)) + geom_point() + theme_bw()

## distribution of held-out MSE across all 100 splits
gather(MSEs, key = fit, value = MSE, model1:model5) %>%
ggplot(aes(x = fit, y = MSE)) + geom_violin(alpha = .4, fill = "blue") +
theme_bw() + ylim(0, 1.5)
## Warning: Removed 1 rows containing non-finite values (stat_ydensity).

## average CV MSE: the richest basis (model5) predicts worst out of sample
apply(MSEs, 2, mean)
## model1 model2 model3 model4 model5
## 0.77698612 0.19078440 0.12511054 0.09944018 1.94061266
####################################################################
## Regression diagnostics & Model checking
####################################################################
## fit three models with differing error structures; create
## residual and qqplots for each
####################################################################
set.seed(1)
## generate x's and y's:
## y1 constant variance; y2 variance grows with x; y3 linear fit to a
## quadratic mean (misspecified mean function)
data.example = data.frame(x = runif(200, 0, 10)) %>%
mutate(y1 = 1+2*x + rnorm(200),
y2 = 1+2*x + rnorm(200) * x,
y3 = 1+(x-4)^2 + rnorm(200, mean = 0, sd = 5)) %>%
arrange(x)
## fit models (all as straight lines in x)
fit1 = lm(y1~x, data = data.example)
fit2 = lm(y2~x, data = data.example)
fit3 = lm(y3~x, data = data.example)
## save fitted values and residuals
data.example = data.example %>%
mutate(fitted1 = fitted(fit1), resid1 = resid(fit1),
fitted2 = fitted(fit2), resid2 = resid(fit2),
fitted3 = fitted(fit3), resid3 = resid(fit3))
## plot data
p = list()
p[[1]] = ggplot(data.example, aes(x = x, y = y1)) + geom_point() + theme_bw()
p[[2]] = ggplot(data.example, aes(x = x, y = y2)) + geom_point() + theme_bw()
p[[3]] = ggplot(data.example, aes(x = x, y = y3)) + geom_point() + theme_bw()
grid.arrange(p[[1]], p[[2]], p[[3]], nrow = 1, ncol = 3)

## plot residuals against fitted values: look for a fan (y2) or a curve
## (y3) versus a structureless band (y1)
p = list()
p[[1]] = ggplot(data.example, aes(x = fitted1, y = resid1)) + geom_point() + theme_bw()
p[[2]] = ggplot(data.example, aes(x = fitted2, y = resid2)) + geom_point() + theme_bw()
p[[3]] = ggplot(data.example, aes(x = fitted3, y = resid3)) + geom_point() + theme_bw()
grid.arrange(p[[1]], p[[2]], p[[3]], nrow = 1, ncol = 3)

## qqplot of the residuals for each fit
p = list()
p[[1]] = ggplot(data.example, aes(sample = resid1)) + stat_qq() + theme_bw()
p[[2]] = ggplot(data.example, aes(sample = resid2)) + stat_qq() + theme_bw()
p[[3]] = ggplot(data.example, aes(sample = resid3)) + stat_qq() + theme_bw()
grid.arrange(p[[1]], p[[2]], p[[3]], nrow = 1, ncol = 3)

####################################################################
## leverage plots 1
####################################################################
set.seed(1)
## point 1 is moved far outside the x range (x = 12.5) and off the line:
## high leverage, so it pulls the fitted line toward itself
data.example = data.frame(x = runif(100, 0, 10)) %>%
mutate(y = 1+2*x + rnorm(100)) %>%
mutate(x = replace(x, 1, 12.5),
y = replace(y, 1, 10))
fit = lm(y~x, data = data.example)
ggplot(data.example, aes(x=x, y=y)) + geom_point() +
stat_smooth(method = "lm", se = FALSE) + theme_bw()

## plot hat values; the reference line is the 2p/n rule of thumb with
## p = 2 coefficients
## NOTE(review): denominator 101 looks like it should be n = 100 -- confirm
data.example = mutate(data.example,
hatvals = hatvalues(fit),
index = seq_along(hatvals))
ggplot(data.example, aes(x=index, y=hatvals)) + geom_point() +
ylim(0,0.09) + geom_hline(yintercept = 2*2/101) +
theme_bw()

####################################################################
## leverage plots 2
####################################################################
set.seed(1)
## point 1 is moved inside the x range (x = 5.5) but off the line:
## an outlier in y with LOW leverage, so it barely moves the fit
data.example = data.frame(x = runif(100, 0, 10)) %>%
mutate(y = 1+2*x + rnorm(100)) %>%
mutate(x = replace(x, 1, 5.5),
y = replace(y, 1, 3))
fit = lm(y~x, data = data.example)
ggplot(data.example, aes(x=x, y=y)) + geom_point() +
stat_smooth(method = "lm", se = FALSE) + theme_bw()

## plot hat values; no point exceeds the 2p/n reference line by much
## NOTE(review): denominator 101 looks like it should be n = 100 -- confirm
data.example = mutate(data.example,
hatvals = hatvalues(fit),
index = seq_along(hatvals))
ggplot(data.example, aes(x=index, y=hatvals)) + geom_point() +
ylim(0,0.09) + geom_hline(yintercept = 2*2/101) +
theme_bw()

####################################################################
## leverage plots 3
####################################################################
set.seed(1)
## x ~ N(1, .5) with point 1 moved to (10, 20): extreme leverage far from
## the bulk of the data, dominating the fit
data.example = data.frame(x = rnorm(100, 1, .5)) %>%
mutate(y = 3 + rnorm(100)) %>%
mutate(x = replace(x, 1, 10),
y = replace(y, 1, 20))
fit = lm(y~x, data = data.example)
ggplot(data.example, aes(x=x, y=y)) + geom_point() +
stat_smooth(method = "lm", se = FALSE) + theme_bw()

## plot hat values (no ylim here: the extreme point's hat value is large)
## NOTE(review): denominator 101 looks like it should be n = 100 -- confirm
data.example = mutate(data.example,
hatvals = hatvalues(fit),
index = seq_along(hatvals))
ggplot(data.example, aes(x=index, y=hatvals)) + geom_point() +
geom_hline(yintercept = 2*2/101) +
theme_bw()

####################################################################
## cook's distance
####################################################################
## plot Cook's distances, which combine leverage and residual size to
## measure each point's influence on the fit
## (original comment said "hat values" -- copy/paste from the blocks above)
data.example = mutate(data.example,
cooks = cooks.distance(fit),
index = seq_along(cooks))
## BUG FIX: the reference line previously reused the 2*p/n hat-value
## threshold from the leverage plots; the conventional rule of thumb for
## Cook's distance is 4/n
ggplot(data.example, aes(x=index, y=cooks)) + geom_point() +
geom_hline(yintercept = 4/nrow(data.example)) +
theme_bw()

####################################################################
## Model selection
####################################################################
## load the life expectancy data and run backward selection
####################################################################
data(state)
statedata = data.frame(state.x77,row.names=state.abb)
## full model with all seven predictors; at each step below we drop the
## predictor with the largest p-value and watch the AIC
g = lm(Life.Exp ~., data=statedata)
summary(g)
##
## Call:
## lm(formula = Life.Exp ~ ., data = statedata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.48895 -0.51232 -0.02747 0.57002 1.49447
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 70.94322411113 1.74797537818 40.586 < 0.0000000000000002 ***
## Population 0.00005180036 0.00002918703 1.775 0.0832 .
## Income -0.00002180424 0.00024442561 -0.089 0.9293
## Illiteracy 0.03382032136 0.36627989117 0.092 0.9269
## Murder -0.30112317045 0.04662072985 -6.459 0.0000000868 ***
## HS.Grad 0.04892947888 0.02332327770 2.098 0.0420 *
## Frost -0.00573500110 0.00314322966 -1.825 0.0752 .
## Area -0.00000007383 0.00000166816 -0.044 0.9649
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7448 on 42 degrees of freedom
## Multiple R-squared: 0.7362, Adjusted R-squared: 0.6922
## F-statistic: 16.74 on 7 and 42 DF, p-value: 0.0000000002534
AIC(g)
## [1] 121.7092
## step 1: drop Area (largest p-value); AIC improves
g = lm(Life.Exp ~ . - Area, data=statedata)
summary(g)
##
## Call:
## lm(formula = Life.Exp ~ . - Area, data = statedata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.49047 -0.52533 -0.02546 0.57160 1.50374
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 70.98931852 1.38745441 51.165 < 0.0000000000000002 ***
## Population 0.00005188 0.00002879 1.802 0.0785 .
## Income -0.00002444 0.00023429 -0.104 0.9174
## Illiteracy 0.02845881 0.34163295 0.083 0.9340
## Murder -0.30182314 0.04334432 -6.963 0.0000000145 ***
## HS.Grad 0.04847232 0.02066727 2.345 0.0237 *
## Frost -0.00577576 0.00297023 -1.945 0.0584 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7361 on 43 degrees of freedom
## Multiple R-squared: 0.7361, Adjusted R-squared: 0.6993
## F-statistic: 19.99 on 6 and 43 DF, p-value: 0.00000000005362
AIC(g)
## [1] 119.7116
## step 2: drop Illiteracy; AIC improves again
g = lm(Life.Exp ~ . - (Area + Illiteracy), data=statedata)
summary(g)
##
## Call:
## lm(formula = Life.Exp ~ . - (Area + Illiteracy), data = statedata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.4892 -0.5122 -0.0329 0.5645 1.5166
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 71.06575094 1.02894145 69.067 < 0.0000000000000002 ***
## Population 0.00005115 0.00002709 1.888 0.0657 .
## Income -0.00002477 0.00023160 -0.107 0.9153
## Murder -0.30000770 0.03704182 -8.099 0.000000000291 ***
## HS.Grad 0.04775797 0.01859079 2.569 0.0137 *
## Frost -0.00590986 0.00246778 -2.395 0.0210 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7277 on 44 degrees of freedom
## Multiple R-squared: 0.7361, Adjusted R-squared: 0.7061
## F-statistic: 24.55 on 5 and 44 DF, p-value: 0.00000000001019
AIC(g)
## [1] 117.7196
## step 3: drop Income; AIC improves again
g = lm(Life.Exp ~ . - (Area + Illiteracy + Income), data=statedata)
summary(g)
##
## Call:
## lm(formula = Life.Exp ~ . - (Area + Illiteracy + Income), data = statedata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.47095 -0.53464 -0.03701 0.57621 1.50683
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 71.02712853 0.95285296 74.542 < 0.0000000000000002 ***
## Population 0.00005014 0.00002512 1.996 0.05201 .
## Murder -0.30014880 0.03660946 -8.199 0.000000000177 ***
## HS.Grad 0.04658225 0.01482706 3.142 0.00297 **
## Frost -0.00594329 0.00242087 -2.455 0.01802 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7197 on 45 degrees of freedom
## Multiple R-squared: 0.736, Adjusted R-squared: 0.7126
## F-statistic: 31.37 on 4 and 45 DF, p-value: 0.000000000001696
AIC(g)
## [1] 115.7326
## step 4: dropping Population too makes AIC worse (115.73 -> 117.97),
## so backward selection by AIC stops at the previous model
g = lm(Life.Exp ~ . - (Area + Illiteracy + Income + Population), data=statedata)
summary(g)
##
## Call:
## lm(formula = Life.Exp ~ . - (Area + Illiteracy + Income + Population),
## data = statedata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.5015 -0.5391 0.1014 0.5921 1.2268
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 71.036379 0.983262 72.246 < 0.0000000000000002 ***
## Murder -0.283065 0.036731 -7.706 0.000000000804 ***
## HS.Grad 0.049949 0.015201 3.286 0.00195 **
## Frost -0.006912 0.002447 -2.824 0.00699 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7427 on 46 degrees of freedom
## Multiple R-squared: 0.7127, Adjusted R-squared: 0.6939
## F-statistic: 38.03 on 3 and 46 DF, p-value: 0.000000000001634
AIC(g)
## [1] 117.9743
####################################################################
## Penalized Regression (ridge, lasso)
####################################################################
## load the life expectancy data and run a model using all variables
####################################################################
data(state)
statedata = data.frame(state.x77,row.names=state.abb)
## OLS fit with all seven predictors; baseline for the ridge fits below
model.full = lm(Life.Exp ~., data=statedata)
coef(model.full)
## (Intercept) Population Income Illiteracy
## 70.94322411112951 0.00005180036383 -0.00002180423783 0.03382032135529
## Murder HS.Grad Frost Area
## -0.30112317045183 0.04892947888172 -0.00573500110356 -0.00000007383166
summary(model.full)
##
## Call:
## lm(formula = Life.Exp ~ ., data = statedata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.48895 -0.51232 -0.02747 0.57002 1.49447
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 70.94322411113 1.74797537818 40.586 < 0.0000000000000002 ***
## Population 0.00005180036 0.00002918703 1.775 0.0832 .
## Income -0.00002180424 0.00024442561 -0.089 0.9293
## Illiteracy 0.03382032136 0.36627989117 0.092 0.9269
## Murder -0.30112317045 0.04662072985 -6.459 0.0000000868 ***
## HS.Grad 0.04892947888 0.02332327770 2.098 0.0420 *
## Frost -0.00573500110 0.00314322966 -1.825 0.0752 .
## Area -0.00000007383 0.00000166816 -0.044 0.9649
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7448 on 42 degrees of freedom
## Multiple R-squared: 0.7362, Adjusted R-squared: 0.6922
## F-statistic: 16.74 on 7 and 42 DF, p-value: 0.0000000002534
####################################################################
## try a few ridge regression penalties
####################################################################
library(MASS)
## an enormous penalty shrinks every slope essentially to zero
## (the intercept is not penalized)
model.ridge1 = lm.ridge(Life.Exp ~., data=statedata, lambda = 1000000)
coef(model.ridge1)
## Population Income
## 70.87833376997693335 -0.00000000102274337 0.00000003716046615
## Illiteracy Murder HS.Grad
## -0.00006479091177417 -0.00001419577213165 0.00000483752778187
## Frost Area
## 0.00000033830292367 -0.00000000008442886
## a near-zero penalty reproduces the OLS coefficients from model.full
model.ridge2 = lm.ridge(Life.Exp ~., data=statedata, lambda = .0000001)
coef(model.ridge2)
## Population Income Illiteracy
## 70.94322410514499 0.00005180036336 -0.00002180423585 0.03382031584235
## Murder HS.Grad Frost Area
## -0.30112316886285 0.04892947874928 -0.00573500108468 -0.00000007383168
####################################################################
## use cross-validation to choose lambda in ridge regression
####################################################################
## 10 repeats of a random 40/10 train/test split; MSE[k, l] holds the
## test MSE of repeat k at penalty lam[l]
set.seed(12)
grid.lam = seq(-2, 14, by = .1)
lam = 2^(grid.lam)
COEFS = matrix(NA, nrow = 7, ncol = length(lam))
MSE = matrix(NA, nrow = 10, ncol = length(lam))
for(k in 1:10){
  ## redraw the train/test labels each repeat (40 train, 10 test)
  statedata = mutate(statedata,
    cv_group = sample(1:50, 50, replace = FALSE) <= 40,
    cv_group = factor(cv_group, levels = c(TRUE, FALSE),
                      labels = c("train", "test")))
  for(l in seq_along(lam)){
    data.train = filter(statedata, cv_group == "train") %>% dplyr::select(-cv_group)
    data.test = filter(statedata, cv_group == "test") %>% dplyr::select(-cv_group)
    model.cur = lm.ridge(Life.Exp ~ ., data = data.train, lambda = lam[l])
    ## lm.ridge has no predict method: build predictions by hand,
    ## prepending a column of 1s for the intercept
    predictions = as.matrix(cbind(1, dplyr::select(data.test, -Life.Exp))) %*%
      coef(model.cur)
    MSE[k,l] = mean(as.matrix((dplyr::select(data.test, Life.Exp)) - predictions)^2)
  }
}
## the full-data coefficient path depends only on lambda, not on the CV
## split, so compute it once (the original recomputed it identically
## inside every one of the 10 repeats); [-1] drops the intercept
for(l in seq_along(lam)){
  COEFS[,l] = coef(lm.ridge(Life.Exp ~ .,
                            data = dplyr::select(statedata, -cv_group),
                            lambda = lam[l]))[-1]
}
## plot CV curve for ridge regression
plot.dat = data.frame(grid = grid.lam, MSE = apply(MSE, 2, mean))
## NOTE: labs() takes `y =`, not `ylab =`; the original `ylab =` was
## silently ignored so the axis label never appeared
ggplot(plot.dat, aes(x = grid, y = MSE)) + geom_path() +
  labs(x = expression(log[2](lambda)), y = "CV MSE")

## coefficient plot for ridge regression
rownames(COEFS) = colnames(dplyr::select(statedata, -Life.Exp, -cv_group))
colnames(COEFS) = grid.lam
plot.dat = melt(COEFS)
ggplot(plot.dat, aes(x = X2, y = value, group = X1, color = X1)) + geom_path() +
  labs(x = expression(log[2](lambda)), y = "Coefficient")

## output of final model, refit at the CV-minimizing penalty
Lam.Final = lam[which.min(apply(MSE, 2, mean))]
## fit on the predictors only: the original passed statedata with the
## cv_group factor still attached, so the "final" model contained a
## spurious cv_grouptest coefficient (visible in the echoed output below)
model.ridge3 = lm.ridge(Life.Exp ~ .,
                        data = dplyr::select(statedata, -cv_group),
                        lambda = Lam.Final)
coef(model.ridge3)
## (output below was produced by the original, buggy fit that kept cv_group)
## Population Income Illiteracy
## 70.7333872194932 0.0000399866985 0.0000072077105 -0.1172716164569
## Murder HS.Grad Frost Area
## -0.2510285576884 0.0430902564100 -0.0040978234118 -0.0000002492987
## cv_grouptest
## 0.4499632402995
####################################################################
## try a few lasso penalties
####################################################################
## NOTE: rm(list = ls()) wipes the workspace -- fine in this teaching
## script, but it means the section must be run from the top
rm(list = ls())
data(state)
statedata = data.frame(state.x77,row.names=state.abb)
## column 4 of state.x77 is Life Exp: X = predictors, y = response
X = as.matrix(statedata[,-4])
y = statedata[,4]
## near-zero penalty: coefficients essentially match OLS
model.lasso1 = glmnet(X, y, lambda = 0.00001)
coef(model.lasso1)
## 8 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) 70.94187060654372
## Population 0.00005185263339
## Income -0.00002191146801
## Illiteracy 0.03467775014259
## Murder -0.30121570685934
## HS.Grad 0.04894537973228
## Frost -0.00573085295669
## Area -0.00000007370497
## moderate penalty: Income, Illiteracy, Area shrunk exactly to zero
model.lasso2 = glmnet(X, y, lambda = 0.01)
coef(model.lasso2)
## 8 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) 71.01048181388
## Population 0.00004762476
## Income .
## Illiteracy .
## Murder -0.29445650829
## HS.Grad 0.04551700912
## Frost -0.00554215728
## Area .
## huge penalty: every slope shrunk to zero (intercept-only model)
model.lasso3 = glmnet(X, y, lambda = 10)
coef(model.lasso3)
## 8 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) 70.8786
## Population 0.0000
## Income .
## Illiteracy .
## Murder .
## HS.Grad .
## Frost .
## Area .
####################################################################
## use cross-validation to choose lambda
####################################################################
set.seed(12)
grid.lam = seq(-9, 0, by = .1)
lam = 2^(grid.lam)
## NOTE: GROUP is never used below, but this sample() call consumes
## random numbers after set.seed(12); it is kept so all later draws
## stay reproducible with the original script
GROUP = sample(rep(1:5, each = 10), 50)
COEFS = matrix(NA, nrow = 7, ncol = length(lam))
MSE = matrix(NA, nrow = 10, ncol = length(lam))
for(k in 1:10){
  ## redraw a random 40/10 train/test split each repeat
  statedata = mutate(statedata,
    cv_group = sample(1:50, 50, replace = FALSE) <= 40,
    cv_group = factor(cv_group, levels = c(TRUE, FALSE),
                      labels = c("train", "test")))
  for(l in seq_along(lam)){
    data.train = filter(statedata, cv_group == "train") %>% dplyr::select(-cv_group)
    data.test = filter(statedata, cv_group == "test") %>% dplyr::select(-cv_group)
    ## (renamed from the original's misleading X.test/y.test:
    ## these hold the TRAINING design matrix and response)
    X.train = dplyr::select(data.train, -Life.Exp) %>% as.matrix
    y.train = dplyr::select(data.train, Life.Exp) %>% as.matrix %>% as.vector
    model.cur = glmnet(X.train, y.train, lambda = lam[l])
    predictions = predict(model.cur,
                          newx = dplyr::select(data.test, -Life.Exp) %>% as.matrix)
    y.test = dplyr::select(data.test, Life.Exp) %>% as.matrix %>% as.vector
    MSE[k,l] = mean((y.test - predictions)^2)
  }
}
## full-data design/response and coefficient paths do not depend on the
## CV split, so compute them once after the loop (the original redid
## this identically inside every repeat, and mixed inclusion/exclusion
## in select(statedata, Life.Exp, -cv_group))
X = dplyr::select(statedata, -Life.Exp, -cv_group) %>% as.matrix
y = dplyr::select(statedata, Life.Exp) %>% as.matrix %>% as.vector
for(l in seq_along(lam)){
  COEFS[,l] = coef(glmnet(X, y, lambda = lam[l]))[-1]
}
## plot CV curve for the lasso
plot.dat = data.frame(grid = grid.lam, MSE = apply(MSE, 2, mean))
## NOTE: labs() takes `y =`, not `ylab =`; the original `ylab =` was
## silently ignored so the axis label never appeared
ggplot(plot.dat, aes(x = grid, y = MSE)) + geom_path() +
  labs(x = expression(log[2](lambda)), y = "CV MSE")

## coefficient plot for the lasso
rownames(COEFS) = colnames(dplyr::select(statedata, -Life.Exp, -cv_group))
colnames(COEFS) = grid.lam
plot.dat = melt(COEFS)
ggplot(plot.dat, aes(x = X2, y = value, group = X1, color = X1)) + geom_path() +
  labs(x = expression(log[2](lambda)), y = "Coefficient")

## output of final model at the CV-minimizing penalty
## (which.min returns the first minimizer; the original which(== min)
## form would pass glmnet a vector of lambdas on ties)
Lam.Final = lam[which.min(apply(MSE, 2, mean))]
model.lasso4 = glmnet(X, y, lambda = Lam.Final)
coef(model.lasso4)
## 8 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) 70.95395594683
## Population 0.00003900914
## Income .
## Illiteracy .
## Murder -0.27500983918
## HS.Grad 0.04187267828
## Frost -0.00417151783
## Area .
####################################################################
## Splines and Penalized Splines
####################################################################
## load lidar data
####################################################################
data(lidar)
## x is assigned but later code works with `range`/`y` copies instead
x = lidar$range
## base scatterplot reused (as p1) under every fit below
p1 = ggplot(lidar, aes(x = range, y = logratio)) + geom_point()
p1

####################################################################
## construct spline basis and fit piecewise linear model
####################################################################
## NOTE: this masks base::range for the rest of the script
range = lidar$range
y = lidar$logratio
knots <- c(550, 625)
## design matrix: intercept, linear term, and one truncated-line
## basis function (range - k)_+ per knot
X.des = cbind(1, range, sapply(knots, function(k)
((range - k > 0) * (range - k))))
## `- 1` removes lm's implicit intercept (X.des already has one)
lm.lin = lm(y ~ X.des - 1)
summary(lm.lin)
##
## Call:
## lm(formula = y ~ X.des - 1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.294936 -0.034261 0.001713 0.039888 0.255162
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## X.des -0.01444288 0.06873539 -0.210 0.834
## X.desrange -0.00008407 0.00014266 -0.589 0.556
## X.des -0.00704279 0.00038342 -18.368 <0.0000000000000002 ***
## X.des 0.00572319 0.00051535 11.105 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08005 on 217 degrees of freedom
## Multiple R-squared: 0.9617, Adjusted R-squared: 0.961
## F-statistic: 1362 on 4 and 217 DF, p-value: < 0.00000000000000022
## overlay the fitted piecewise-linear curve on the scatterplot
lidar.pred = data.frame(range = range, fitted = fitted(lm.lin))
p1 + geom_path(data = lidar.pred, aes(x = range, y = fitted), color = "red")

####################################################################
## again, with different knots
####################################################################
## same truncated-line basis, knots moved to 500 and 650
knots <- c(500, 650)
X.des = cbind(1, range, sapply(knots, function(k)
((range - k > 0) * (range - k))))
lm.lin = lm(y ~ X.des - 1)
summary(lm.lin)
##
## Call:
## lm(formula = y ~ X.des - 1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.304857 -0.057340 0.006559 0.046501 0.255679
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## X.des -0.4732124 0.1167565 -4.053 0.00007046 ***
## X.desrange 0.0009918 0.0002533 3.915 0.000121 ***
## X.des -0.0052984 0.0003695 -14.340 < 0.0000000000000002 ***
## X.des 0.0027191 0.0005668 4.798 0.00000298 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.09382 on 217 degrees of freedom
## Multiple R-squared: 0.9474, Adjusted R-squared: 0.9464
## F-statistic: 976.4 on 4 and 217 DF, p-value: < 0.00000000000000022
## overlay the second piecewise-linear fit
lidar.pred = data.frame(range = range, fitted = fitted(lm.lin))
p1 + geom_path(data = lidar.pred, aes(x = range, y = fitted), color = "red")

####################################################################
## construct spline basis and fit piecewise quadratic model
####################################################################
knots <- c(500, 600, 675)
## quadratic trend plus one truncated-quadratic basis (range - k)_+^2
## per knot -> continuous first derivative at the knots
X.des = cbind(1, range, range^2, sapply(knots, function(k)
((range - k > 0) * (range - k)^2)))
lm.lin = lm(y ~ X.des - 1)
summary(lm.lin)
##
## Call:
## lm(formula = y ~ X.des - 1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.275013 -0.038263 0.002416 0.041911 0.244874
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## X.des 1.228093762 1.058287506 1.160 0.2472
## X.desrange -0.005749387 0.004574109 -1.257 0.2101
## X.des 0.000006424 0.000004905 1.310 0.1917
## X.des -0.000049227 0.000008055 -6.111 0.00000000459 ***
## X.des 0.000099100 0.000010115 9.798 < 0.0000000000000002 ***
## X.des -0.000097083 0.000038415 -2.527 0.0122 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08241 on 215 degrees of freedom
## Multiple R-squared: 0.9598, Adjusted R-squared: 0.9586
## F-statistic: 854.7 on 6 and 215 DF, p-value: < 0.00000000000000022
## overlay the piecewise-quadratic fit
lidar.pred = data.frame(range = range, fitted = fitted(lm.lin))
p1 + geom_path(data = lidar.pred, aes(x = range, y = fitted), color = "red")

####################################################################
## b-spline basis
####################################################################
library(splines)
## cubic B-spline basis with 6 total basis functions (df = 6)
bspline = bs(x = range, df = 6, degree = 3, intercept = TRUE)
rownames(bspline) = range
colnames(bspline) = paste0("Basis ", 1:6)
## melt to long format: X1 = range value, X2 = basis label
bspline.m = melt(bspline)
ggplot(bspline.m, aes(x = X1, y = value, group = X2, color = X2)) + geom_path()

####################################################################
## define CV groups, construct design matrices for training and validation data
####################################################################
set.seed(1001)
## chained assignment: GROUP (0 = train, 1 = validation, ~80/20) is
## stored both as a free variable and as a lidar column for plotting
GROUP = lidar$GROUP = sample(c(0,1), size = length(y), prob = c(.8, .2), replace = TRUE)
## 43 interior knots at quantiles of range (endpoints dropped)
knots <- quantile(range, seq(0, 1, length = 45))[-c(1, 45)]
X.full = cbind(1, range, sapply(knots, function(k) ((range - k > 0) * (range - k))))
X.train = X.full[which(GROUP != 1), ]; y.train = y[which(GROUP != 1)]
X.valid = X.full[which(GROUP == 1), ]; y.valid = y[which(GROUP == 1)]
####################################################################
## construct penalty matrix; estimate ridge regression for a few lambdas
####################################################################
## identity penalty with P[1,1] = 0: the intercept is left unpenalized
## (note the linear slope, column 2, IS penalized here)
P = diag(1, dim(X.full)[2], dim(X.full)[2])
P[1,1] = 0
lambda = 10^(-2:7)
## closed-form (penalized) least-squares: beta = (X'X + lambda P)^-1 X'y
beta.lm = solve(t(X.train) %*% X.train)%*% t(X.train) %*% y.train
beta.lam1 = solve(t(X.train) %*% X.train + lambda[4]* P)%*% t(X.train) %*% y.train
beta.lam2 = solve(t(X.train) %*% X.train + lambda[10]* P)%*% t(X.train) %*% y.train
beta.lam3 = solve(t(X.train) %*% X.train + lambda[6]* P)%*% t(X.train) %*% y.train
####################################################################
## plot results
####################################################################
## scatter colored by CV group (black = train, red = validation)
p1 = ggplot(lidar, aes(x = range, y = logratio, color = as.factor(GROUP))) +
geom_point() +
scale_color_manual(values = c("black", "red"), guide = "none")
p1

## unpenalized fit on the training rows
lidar.pred = data.frame(range = range[which(GROUP == 0)], fitted = X.train %*% beta.lm)
p1 + geom_path(data = lidar.pred, aes(x = range, y = fitted), color = "blue")

## lambda = 10 (lambda[4])
lidar.pred$fitted = X.train %*% beta.lam1
p1 + geom_path(data = lidar.pred, aes(x = range, y = fitted), color = "blue")

## lambda = 10^7 (lambda[10]): heavy smoothing
lidar.pred$fitted = X.train %*% beta.lam2
p1 + geom_path(data = lidar.pred, aes(x = range, y = fitted), color = "blue")

## lambda = 10^3 (lambda[6])
lidar.pred$fitted = X.train %*% beta.lam3
p1 + geom_path(data = lidar.pred, aes(x = range, y = fitted), color = "blue")

####################################################################
## find cross-validation curve for one fold only
####################################################################
grid.lam = seq(-2, 7, by = .2)
lambda = 10^(grid.lam)
CV = rep(NA, length(lambda))
## validation MSE of the penalized fit at each lambda on the grid
for(l in seq_along(lambda)){
  beta.cur = solve(t(X.train) %*% X.train + lambda[l]* P)%*% t(X.train) %*% y.train
  CV[l] = mean((y.valid - X.valid %*% beta.cur)^2 )
}
## plot CV curve for ridge regression
## NOTE: labs() takes `y =`, not `ylab =`; the original `ylab =` was
## silently ignored so the axis label never appeared
plot.dat = data.frame(grid = grid.lam, CV = CV)
ggplot(plot.dat, aes(x = grid, y = CV)) + geom_path() +
  labs(x = expression(log[10](lambda)), y = "CV MSE")

####################################################################
## kernel smoothing
####################################################################
p1 = ggplot(lidar, aes(x = range, y = logratio)) + geom_point()
## Gaussian kernel smoother at three bandwidths
kern.smooth1 = as.data.frame(ksmooth(range, y, kernel = "normal", bandwidth = 50))
p1 + geom_path(data = kern.smooth1, aes(x = x, y = y), color = "blue")

## small bandwidth: wiggly fit
kern.smooth2 = as.data.frame(ksmooth(range, y, kernel = "normal", bandwidth = 10))
p1 + geom_path(data = kern.smooth2, aes(x = x, y = y), color = "blue")

## large bandwidth: oversmoothed (note: kern.smooth2 is overwritten here)
kern.smooth2 = as.data.frame(ksmooth(range, y, kernel = "normal", bandwidth = 500))
p1 + geom_path(data = kern.smooth2, aes(x = x, y = y), color = "blue")

####################################################################
## Additive Models
## load nepalese data (comma-separated despite read.delim; sep overrides)
data <- read.delim("nepal.dat", header = TRUE, sep=",")
## sort by weight so fitted-line plots draw left to right
data = data[order(data$weight),]
## divide data into training and validation data (~80% train)
set.seed(1)
data$GROUP = sample(c(0,1), prob = c(.2, .8),
size = dim(data)[1], replace = TRUE)
data.train = subset(data, GROUP == 1)
data.valid = subset(data, GROUP == 0)
## plot arm circ against weight (red = validation, black = train)
## NOTE: guide = FALSE is deprecated in newer ggplot2; guide = "none"
## is the current spelling -- kept as-is to match the rendered output
p1 = ggplot(data, aes(x = weight, y = armc,
color = as.factor(GROUP))) +
geom_point(alpha = .5) +
scale_color_manual(values = c("red", "black"), guide = FALSE)
p1

####################################################################
## some regression analyses of armcirc vs weight
####################################################################
## CV[i] = validation MSE of fit i; fx is reused for each model in turn
CV = rep(NA, 6)
## f(x) = E(y): constant (mean-only) fit
fx = mean(data.train$armc)
p1 + geom_hline(yintercept = fx, color = "blue", size = 1.5)

CV[1] = mean((data.valid$armc - fx)^2)
## line
fx = lm(armc~weight, data = data.train)
p1 + stat_smooth(data = data.train,
method = lm, se = FALSE,
color = "blue", size = 1.5)

CV[2] = mean((data.valid$armc - predict(fx, newdata = data.valid))^2)
## poly: quartic polynomial in weight
fx = lm(armc~poly(weight, 4), data = data.train)
plot.fx = data.frame(weight = data.train$weight, fx = fitted(fx))
p1 + geom_line(data = plot.fx,
aes(x = weight, y = fx, color = NULL),
color = "blue", size = 1.5)

CV[3] = mean((data.valid$armc - predict(fx, newdata = data.valid))^2)
## spline: single truncated-line basis term with a knot at weight = 7
data.train = mutate(data.train, weight.sp = (weight>7)*(weight - 7))
data.valid = mutate(data.valid, weight.sp = (weight>7)*(weight - 7))
fx = lm(armc ~ weight + weight.sp, data = data.train)
plot.fx = data.frame(weight = data.train$weight, fx = fitted(fx))
p1 + geom_line(data = plot.fx,
aes(x = weight, y = fx, color = NULL),
color = "blue", size = 1.5)

CV[4] = mean((data.valid$armc - predict(fx, newdata = data.valid))^2)
## penalized spline (mgcv chooses the smoothing parameter)
fx = mgcv::gam(armc ~ s(weight), data = data.train)
plot.fx = data.frame(weight = data.train$weight, fx = fitted(fx))
p1 + geom_line(data = plot.fx,
aes(x = weight, y = fx, color = NULL),
color = "blue", size = 1.5)

CV[5] = mean((data.valid$armc - predict(fx, newdata = data.valid))^2)
## kernel smoother
plot.fx = with(data.train,
as.data.frame(ksmooth(weight, armc,
kernel = "normal",
bandwidth = 2)))
p1 + geom_line(data = plot.fx,
aes(x = x, y = y, color = NULL),
color = "blue", size = 1.5)

## ksmooth returns values at sorted x.points, so the validation
## responses below are reordered by weight to line up with fx.cv
fx.cv = with(data.train, ksmooth(weight, armc, kernel = "normal",
bandwidth = 2,
x.points = data.valid$weight)$y)
CV[6] = mean((data.valid$armc[order(data.valid$weight)] - fx.cv)^2)
## compare CV across the six fits
CV = data.frame(x = 1:6, CV = CV)
ggplot(CV, aes(x = x, y = CV)) +
geom_point() + geom_line() +
labs(x = "") +
scale_x_continuous(breaks = 1:6,
labels = c("Obs Mean", "SLR", "Poly",
"Spline", "P-Spline", "Kernel"))

####################################################################
## extended case study of arm circumference
####################################################################
## penalized spline (same fit as above, now examined via summary)
fx = mgcv::gam(armc ~ s(weight), data = data.train)
plot.fx = data.frame(weight = data.train$weight, fx = fitted(fx))
p1 + geom_line(data = plot.fx,
aes(x = weight, y = fx, color = NULL),
color = "blue", size = 1.5)

summary(fx)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## armc ~ s(weight)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 23.761 0.235 101.1 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(weight) 3.087 3.654 3452 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.94 Deviance explained = 94.1%
## GCV = 44.533 Scale est. = 44.306 n = 802
## penalized spline -- different penalty and spline basis:
## large basis (k = 100) with a tiny fixed smoothing parameter,
## so the fit is deliberately undersmoothed (edf ~ 97 in the summary)
fx = gam(armc ~ s(weight, k = 100), data = data.train, sp = (.0001))
plot.fx = data.frame(weight = data.train$weight, fx = fitted(fx))
p1 + geom_line(data = plot.fx,
aes(x = weight, y = fx, color = NULL),
color = "blue", size = 1.5)

summary(fx)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## armc ~ s(weight, k = 100)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 23.7606 0.2498 95.13 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(weight) 96.53 98.77 113.1 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.933 Deviance explained = 94.1%
## GCV = 56.964 Scale est. = 50.037 n = 802
## separate boys and girls: per-sex smooths of armc vs weight
ggplot(data.train, aes(x = weight, y = armc, color = as.factor(sex))) +
geom_point(alpha = .5) +
scale_color_manual(values = c("blue", "red"), guide = FALSE) +
stat_smooth(method = "gam", formula = y ~ s(x), se = FALSE, size = 1.5)

## NOTE(review): `sex + sex * weight` duplicates sex (terms() drops the
## duplicate), and weight appears both linearly and inside s(weight),
## which is why the echoed summary reports Rank: 12/13
fx = gam(armc ~ sex + sex * weight + s(weight), data = data.train)
summary(fx)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## armc ~ sex + sex * weight + s(weight)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.92615 2.76527 3.228 0.001298 **
## sex 0.93802 0.59137 1.586 0.113095
## weight 0.70637 0.12325 5.731 0.0000000141 ***
## sex:weight -0.06365 0.01726 -3.688 0.000242 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(weight) 2.767 3.381 15.5 0.000000000109 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 12/13
## R-sq.(adj) = 0.941 Deviance explained = 94.2%
## GCV = 43.926 Scale est. = 43.575 n = 802
## purely linear sex-interaction model for comparison (not assigned to fx)
summary(lm(armc ~ sex + sex * weight, data = data.train))
##
## Call:
## lm(formula = armc ~ sex + sex * weight, data = data.train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -83.139 -1.250 0.369 2.008 8.215
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.94813 0.94810 2.055 0.040226 *
## sex 1.34264 0.60640 2.214 0.027104 *
## weight 0.99946 0.02523 39.610 < 0.0000000000000002 ***
## sex:weight -0.06403 0.01735 -3.691 0.000239 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.83 on 798 degrees of freedom
## Multiple R-squared: 0.9374, Adjusted R-squared: 0.9371
## F-statistic: 3981 on 3 and 798 DF, p-value: < 0.00000000000000022
## default mgcv plot of the smooth -- fx is still the gam fit from
## above (the lm() in the previous summary() call was never assigned)
plot(fx)

## separate age groups (same breaks applied to train and validation)
data.train = mutate(data.train,
ageGroup = cut(age, breaks = c(0, 22, 32, 40, 50, 100),
labels = 1:5))
data.valid = mutate(data.valid,
ageGroup = cut(age, breaks = c(0, 22, 32, 40, 50, 100),
labels = 1:5))
## per-age-group smooths of armc vs weight
ggplot(data.train, aes(x = weight, y = armc, color = ageGroup)) +
geom_point(alpha = .5) +
stat_smooth(method = "gam", formula = y ~ s(x), se = FALSE, size = 1.5) +
scale_color_discrete(guide = FALSE)

## smooth effects of age and weight: additive model with two smooths
fx = gam(armc ~ s(age) + s(weight), data = data.train)
summary(fx)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## armc ~ s(age) + s(weight)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 23.7606 0.2342 101.4 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(age) 1.528 1.879 4.343 0.0114 *
## s(weight) 2.694 3.234 3916.239 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.941 Deviance explained = 94.1%
## GCV = 44.293 Scale est. = 44.005 n = 802
## append the additive model's validation MSE as row 7 of the CV
## data.frame built earlier (columns x, CV)
CV[7,] = c(7, mean((data.valid$armc - predict(fx, newdata = data.valid))^2))
## side-by-side base-graphics panels for the two smooths
par(mfrow = c(1,2))
plot(fx)

## linear model for comparison
fx = lm(armc ~ age + weight, data = data.train)
summary(fx)
##
## Call:
## lm(formula = armc ~ age + weight, data = data.train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -83.299 -0.509 0.131 0.880 8.122
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.340770 0.548202 13.39 < 0.0000000000000002 ***
## age -0.094170 0.012557 -7.50 0.00000000000017 ***
## weight 0.914034 0.008137 112.33 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.654 on 799 degrees of freedom
## Multiple R-squared: 0.9405, Adjusted R-squared: 0.9403
## F-statistic: 6314 on 2 and 799 DF, p-value: < 0.00000000000000022
## append the MLR validation MSE as row 8
CV[8,] = c(8, mean((data.valid$armc - predict(fx, newdata = data.valid))^2))
## compare CV across all eight fits
ggplot(CV, aes(x = x, y = CV)) +
geom_point() +
geom_line() +
labs(x = "") +
scale_x_continuous(breaks = 1:8,
labels = c("Obs Mean", "SLR", "Poly",
"Spline", "P-Spline", "Kernel",
"Smooth", "MLR"))

####################################################################
## Weighted and Generalized Least Squares
####################################################################
## load the physics data from alr3 (the original comment said
## "nyc air quality data", but it is the physics data set)
####################################################################
library(alr3)
data(physics)
head(physics)
## x y SD
## 1 0.345 367 17
## 2 0.287 311 9
## 3 0.251 295 9
## 4 0.225 268 7
## 5 0.207 253 7
## 6 0.186 239 6
## weighted least squares with weights 1/SD^2 (inverse known variances)
lm.physics.wls <- lm(y~x, weights=1/SD^2,data=physics)
summary(lm.physics.wls)
##
## Call:
## lm(formula = y ~ x, data = physics, weights = 1/SD^2)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -2.3230 -0.8842 0.0000 1.3900 2.3353
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 148.473 8.079 18.38 0.0000000791 ***
## x 530.835 47.550 11.16 0.0000037104 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.657 on 8 degrees of freedom
## Multiple R-squared: 0.9397, Adjusted R-squared: 0.9321
## F-statistic: 124.6 on 1 and 8 DF, p-value: 0.00000371
## ordinary (unweighted) least squares for comparison with the WLS fit
lm.physics.ols <- lm(y~x, data=physics)
summary(lm.physics.ols)
##
## Call:
## lm(formula = y ~ x, data = physics)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.773 -9.319 -2.829 5.571 19.817
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 135.00 10.08 13.4 0.000000921 ***
## x 619.71 47.68 13.0 0.000001165 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.69 on 8 degrees of freedom
## Multiple R-squared: 0.9548, Adjusted R-squared: 0.9491
## F-statistic: 168.9 on 1 and 8 DF, p-value: 0.000001165
####################################################################
## Longitudinal Data Analysis
####################################################################
## plot of CD4 data
#
# website for data
# http://www.biostat.jhsph.edu/~fdominic/teaching/bio655/data/data.html
####################################################################
## read Stata file; rename to ID / month / cd4 for plotting
data = foreign::read.dta("cd4.dta")
data = data %>% dplyr::rename(ID = id, month = time, cd4 = count)
## all observations, ignoring subject
ggplot(data, aes(x = month, y = cd4)) + geom_point()

## connect observations within subject (spaghetti plot)
ggplot(data, aes(x = month, y = cd4, group = ID)) +
geom_point() +
geom_line(alpha = .4)

## highlight the first 10 subjects' trajectories in red
ggplot(data, aes(x = month, y = cd4, group = ID)) +
geom_point() +
geom_line(alpha = .4) +
geom_line(data = subset(data, ID %in% unique(data$ID)[1:10]),
color = "red", size = 1)

####################################################################
## pig weight data
####################################################################
## plots: one growth trajectory per pig
data(pig.weights)
ggplot(pig.weights, aes(x = num.weeks, y = weight, group = id.num)) +
geom_point() + geom_path(alpha = .4) +
labs(x = "Week", y = "Weight")

## add the pooled OLS line (group = NULL drops the per-pig grouping)
ggplot(pig.weights, aes(x = num.weeks, y = weight, group = id.num)) +
geom_point() + geom_path(alpha = .4) +
labs(x = "Week", y = "Weight") +
stat_smooth(method = "lm", aes(group = NULL),
se = FALSE, color = "red", size = 1.1)

## random intercept model: common slope, pig-specific intercepts
ranmod = lmer(weight ~ (1 | id.num) + num.weeks, data = pig.weights)
pig.weights$ranmod.fit = fitted(ranmod)
## blue lines = per-pig BLUP fits; red = pooled OLS line
ggplot(pig.weights, aes(x = num.weeks, y = weight, group = id.num)) +
geom_point() + geom_path(alpha = .4) +
labs(x = "Week", y = "Weight") +
stat_smooth(method = "lm", aes(group = NULL),
se = FALSE, color = "red", size = 1.1) +
geom_line(aes(y = ranmod.fit), color = "blue", alpha = .5)

summary(ranmod)
## Linear mixed model fit by REML ['lmerMod']
## Formula: weight ~ (1 | id.num) + num.weeks
## Data: pig.weights
##
## REML criterion at convergence: 2033.8
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.7390 -0.5456 0.0184 0.5122 3.9313
##
## Random effects:
## Groups Name Variance Std.Dev.
## id.num (Intercept) 15.142 3.891
## Residual 4.395 2.096
## Number of obs: 432, groups: id.num, 48
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 19.35561 0.60314 32.09
## num.weeks 6.20990 0.03906 158.97
##
## Correlation of Fixed Effects:
## (Intr)
## num.weeks -0.324
####################################################################
## OLS fit (pooled line over all pigs)
ggplot(pig.weights, aes(x = num.weeks, y = weight, group = id.num)) +
geom_point() + geom_path(alpha = .4) +
labs(x = "Week", y = "Weight") +
stat_smooth(method = "lm", aes(group = NULL),
se = FALSE, color = "red", size = 1.1)

## marginal model fit: GEE with exchangeable working correlation,
## clustered by pig (id.num)
marg.mod = gee::gee(weight ~ num.weeks, id = id.num,
corstr = "exchangeable", data = pig.weights)
## Beginning Cgee S-function, @(#) geeformula.q 4.13 98/01/27
## running glm to get initial regression estimate
## (Intercept) num.weeks
## 19.355613 6.209896
summary(marg.mod)
##
## GEE: GENERALIZED LINEAR MODELS FOR DEPENDENT DATA
## gee S-function, version 4.13 modified 98/01/27 (1998)
##
## Model:
## Link: Identity
## Variance to Mean Relation: Gaussian
## Correlation Structure: Exchangeable
##
## Call:
## gee::gee(formula = weight ~ num.weeks, id = id.num, data = pig.weights,
## corstr = "exchangeable")
##
## Summary of Residuals:
## Min 1Q Median 3Q Max
## -11.9050926 -2.5347801 -0.1951968 2.5949074 13.1751157
##
##
## Coefficients:
## Estimate Naive S.E. Naive z Robust S.E. Robust z
## (Intercept) 19.355613 0.5983680 32.34734 0.39963854 48.43280
## num.weeks 6.209896 0.0393321 157.88366 0.09107443 68.18485
##
## Estimated Scale Parameter: 19.29006
## Number of Iterations: 1
##
## Working Correlation
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 1.0000000 0.7690313 0.7690313 0.7690313 0.7690313 0.7690313
## [2,] 0.7690313 1.0000000 0.7690313 0.7690313 0.7690313 0.7690313
## [3,] 0.7690313 0.7690313 1.0000000 0.7690313 0.7690313 0.7690313
## [4,] 0.7690313 0.7690313 0.7690313 1.0000000 0.7690313 0.7690313
## [5,] 0.7690313 0.7690313 0.7690313 0.7690313 1.0000000 0.7690313
## [6,] 0.7690313 0.7690313 0.7690313 0.7690313 0.7690313 1.0000000
## [7,] 0.7690313 0.7690313 0.7690313 0.7690313 0.7690313 0.7690313
## [8,] 0.7690313 0.7690313 0.7690313 0.7690313 0.7690313 0.7690313
## [9,] 0.7690313 0.7690313 0.7690313 0.7690313 0.7690313 0.7690313
## [,7] [,8] [,9]
## [1,] 0.7690313 0.7690313 0.7690313
## [2,] 0.7690313 0.7690313 0.7690313
## [3,] 0.7690313 0.7690313 0.7690313
## [4,] 0.7690313 0.7690313 0.7690313
## [5,] 0.7690313 0.7690313 0.7690313
## [6,] 0.7690313 0.7690313 0.7690313
## [7,] 1.0000000 0.7690313 0.7690313
## [8,] 0.7690313 1.0000000 0.7690313
## [9,] 0.7690313 0.7690313 1.0000000
## random effects model fit (same random-intercept model as before,
## shown alongside the GEE marginal fit)
ranmod = lmer(weight ~ (1 | id.num) + num.weeks, data = pig.weights)
pig.weights$ranmod.fit = fitted(ranmod)
ggplot(pig.weights, aes(x = num.weeks, y = weight, group = id.num)) +
geom_point() + geom_path(alpha = .4) +
labs(x = "Week", y = "Weight") +
stat_smooth(method = "lm", aes(group = NULL), se = FALSE, color = "red", size = 1.1) +
geom_line(aes(y = ranmod.fit), color = "blue", alpha = .5)

summary(ranmod)
## Linear mixed model fit by REML ['lmerMod']
## Formula: weight ~ (1 | id.num) + num.weeks
## Data: pig.weights
##
## REML criterion at convergence: 2033.8
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.7390 -0.5456 0.0184 0.5122 3.9313
##
## Random effects:
## Groups Name Variance Std.Dev.
## id.num (Intercept) 15.142 3.891
## Residual 4.395 2.096
## Number of obs: 432, groups: id.num, 48
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 19.35561 0.60314 32.09
## num.weeks 6.20990 0.03906 158.97
##
## Correlation of Fixed Effects:
## (Intr)
## num.weeks -0.324
## histogram of the estimated random intercepts (note: this data.frame
## shadows the name of lme4::ranef; later ranef(...) calls still work
## because R looks up a function for call syntax)
ranef = data.frame(int = ranef(ranmod)$id.num[,1])
ggplot(ranef, aes(x = int)) + geom_histogram(binwidth = 2, color = "blue", alpha = .4)

####################################################################
## Random slope models
####################################################################
## pig weight data example of fixed vs random effects
####################################################################
data(pig.weights)
## fixed-effects version: one dummy per pig (no intercept) plus week
X.des = model.matrix( ~ -1 + as.factor(id.num) + num.weeks, data = pig.weights)
lm.fixed = lm(weight ~ -1 + X.des, data = pig.weights)
## center the 48 pig-specific intercepts for comparability
fixed = coef(lm.fixed)[1:48] - mean(coef(lm.fixed)[1:48])
ranef.mod = lmer(weight ~ (1 | id.num) + num.weeks, data = pig.weights)
random = ranef(ranef.mod)$id.num[,1]
## pair each pig's fixed estimate (top) with its random BLUP (bottom);
## the segments show shrinkage toward zero
plot.dat = data.frame(value = c(fixed, random),
yval = rep(c(1,0), each = 48),
pair = rep(1:48, 2))
ggplot(plot.dat, aes(x = value, y = yval, group = pair)) +
geom_line() +
geom_point() +
geom_vline(xintercept = 0, color = "red", size = 1.2) +
theme_bw() +
scale_y_continuous(breaks = c(0,1), labels = c("random", "fixed")) +
labs(y = "", x = "")

####################################################################
## pig weight data
####################################################################
data(pig.weights)
## plot of data
## random effects model fit (random intercept, common slope)
ranmod = lmer(weight ~ (1 | id.num) + num.weeks, data = pig.weights)
pig.weights$ranmod.fit = fitted(ranmod)
ggplot(pig.weights, aes(x = num.weeks, y = weight, group = id.num)) +
geom_point() + geom_path(alpha = .4) +
labs(x = "Week", y = "Weight") +
stat_smooth(method = "lm", aes(group = NULL),
se = FALSE, color = "red", size = 1.1) +
geom_line(aes(y = ranmod.fit), color = "blue", alpha = .5)

summary(ranmod)
## Linear mixed model fit by REML ['lmerMod']
## Formula: weight ~ (1 | id.num) + num.weeks
## Data: pig.weights
##
## REML criterion at convergence: 2033.8
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.7390 -0.5456 0.0184 0.5122 3.9313
##
## Random effects:
## Groups Name Variance Std.Dev.
## id.num (Intercept) 15.142 3.891
## Residual 4.395 2.096
## Number of obs: 432, groups: id.num, 48
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 19.35561 0.60314 32.09
## num.weeks 6.20990 0.03906 158.97
##
## Correlation of Fixed Effects:
## (Intr)
## num.weeks -0.324
## histogram of the predicted (BLUP) random intercepts
ran.int = data.frame(int = ranef(ranmod)$id.num[, 1])
ggplot(ran.int, aes(x = int)) +
  geom_histogram(binwidth = 2, color = "blue", alpha = .4)

## random slope model: subject-specific intercept AND slope in week
ranmod = lmer(weight ~ (1 + num.weeks | id.num) + num.weeks, data = pig.weights)
pig.weights$ranmod.fit = fitted(ranmod)
## raw trajectories (black), pooled OLS fit (red), subject-specific fits (blue)
ggplot(pig.weights, aes(x = num.weeks, y = weight, group = id.num)) +
  geom_point() +
  geom_path(alpha = .4) +
  stat_smooth(aes(group = NULL), method = "lm",
              se = FALSE, color = "red", size = 1.1) +
  geom_line(aes(y = ranmod.fit), color = "blue", alpha = .5) +
  labs(x = "Week", y = "Weight")

## joint distribution of predicted intercepts and slopes (extract once)
subj.eff = ranef(ranmod)$id.num
ranef = data.frame(int = subj.eff[, 1], slope = subj.eff[, 2])
ggplot(ranef, aes(x = int, y = slope)) + geom_point()

####################################################################
## CD4 data example
####################################################################
## load and plot data
cd4 = foreign::read.dta("cd4.dta")
cd4 = cd4 %>% dplyr::rename(ID = id, month = time, cd4 = count)
## keep subjects with more than one visit.
## FIX: index the counts by name -- the original subset unique(cd4$ID) by
## positions taken from table(cd4$ID), which is only correct when unique()'s
## first-appearance order happens to equal table()'s sorted order.
id.counts = table(cd4$ID)
IDS = names(id.counts)[id.counts > 1]
cd4 = cd4 %>% filter(ID %in% IDS)
ggplot(cd4, aes(x = month, y = cd4, group = ID)) +
  geom_point() +
  geom_line(alpha = .4)

## OLS models
## linear first
lin.mod = lm(cd4 ~ month, data = cd4)
## FIX: the plot call referenced `data` (undefined here / a stale object);
## the data set being plotted is `cd4`, as in every sibling plot
ggplot(cd4, aes(x = month, y = cd4, group = ID)) +
  geom_point() + geom_line(alpha = .4) +
  stat_smooth(method = "lm", aes(group = NULL),
              se = FALSE, color = "red", size = 1.1)

## population-level OLS fit summary (transcript below)
summary(lin.mod)
##
## Call:
## lm(formula = cd4 ~ month, data = cd4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -734.49 -252.26 -65.36 174.91 2322.91
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 839.398 8.147 103.03 <0.0000000000000002 ***
## month -89.027 3.965 -22.45 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 362.9 on 2369 degrees of freedom
## Multiple R-squared: 0.1754, Adjusted R-squared: 0.1751
## F-statistic: 504.1 on 1 and 2369 DF, p-value: < 0.00000000000000022
AIC(lin.mod)
## [1] 34682.64
## spline: cubic B-spline basis with 5 df for the population mean trend
bs.mod = lm(cd4 ~ bs(month, 5), data = cd4)
ggplot(cd4, aes(x = month, y = cd4, group = ID)) +
  geom_point() +
  geom_line(alpha = .4) +
  stat_smooth(aes(group = NULL),
              method = "lm",
              formula = y ~ bs(x, 5),
              se = FALSE,
              color = "red",
              size = 1.1)

## spline OLS fit summary (transcript below)
summary(bs.mod)
##
## Call:
## lm(formula = cd4 ~ bs(month, 5), data = cd4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -806.49 -238.46 -61.03 170.08 2261.82
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 898.19 57.65 15.580 < 0.0000000000000002 ***
## bs(month, 5)1 181.75 101.47 1.791 0.07340 .
## bs(month, 5)2 154.21 57.27 2.693 0.00713 **
## bs(month, 5)3 -544.31 84.28 -6.459 0.000000000128 ***
## bs(month, 5)4 -230.25 80.16 -2.873 0.00411 **
## bs(month, 5)5 -400.92 98.69 -4.063 0.000050130841 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 356.2 on 2365 degrees of freedom
## Multiple R-squared: 0.2068, Adjusted R-squared: 0.2051
## F-statistic: 123.3 on 5 and 2365 DF, p-value: < 0.00000000000000022
## lower AIC than the linear OLS fit (34682.64): the spline mean fits better
AIC(bs.mod)
## [1] 34598.69
## random intercept model for the CD4 counts
ranint.mod = lmer(cd4 ~ (1 | ID) + month, data = cd4)
cd4$ranint.mod = fitted(ranint.mod)
## population-level (fixed-effects only) fitted values: X %*% beta-hat
pop.fit = data.frame(month = cd4$month,
                     fitted = getME(ranint.mod, "X") %*% fixef(ranint.mod))
ggplot(cd4, aes(x = month, y = cd4, group = ID)) +
  geom_point() +
  geom_line(alpha = .2) +
  geom_line(data = pop.fit, aes(group = NULL, y = fitted),
            color = "red", size = 1.1) +
  geom_line(aes(y = ranint.mod), color = "blue", alpha = .5)

## random intercept model summary (transcript below)
summary(ranint.mod)
## Linear mixed model fit by REML ['lmerMod']
## Formula: cd4 ~ (1 | ID) + month
## Data: cd4
##
## REML criterion at convergence: 33747.7
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.6240 -0.5538 -0.0664 0.4593 7.3603
##
## Random effects:
## Groups Name Variance Std.Dev.
## ID (Intercept) 64982 254.9
## Residual 66532 257.9
## Number of obs: 2371, groups: ID, 364
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 838.925 14.725 56.97
## month -99.703 3.449 -28.91
##
## Correlation of Fixed Effects:
## (Intr)
## month -0.151
## AIC improves substantially over the OLS fits once the random intercept
## accounts for within-subject correlation
AIC(ranint.mod)
## [1] 33755.67
## random intercept + random slope model for the CD4 counts
ranslope.mod = lmer(cd4 ~ (1 + month | ID) + month, data = cd4)
cd4$ranslope.mod = fitted(ranslope.mod)
## population-level (fixed-effects only) fitted values: X %*% beta-hat
pop.fit = data.frame(month = cd4$month,
                     fitted = getME(ranslope.mod, "X") %*% fixef(ranslope.mod))
ggplot(cd4, aes(x = month, y = cd4, group = ID)) +
  geom_point() +
  geom_line(alpha = .2) +
  geom_line(data = pop.fit, aes(group = NULL, y = fitted),
            color = "red", size = 1.1) +
  geom_line(aes(y = ranslope.mod), color = "blue", alpha = .5)

## random slope model summary (transcript below)
summary(ranslope.mod)
## Linear mixed model fit by REML ['lmerMod']
## Formula: cd4 ~ (1 + month | ID) + month
## Data: cd4
##
## REML criterion at convergence: 33601
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -4.4726 -0.5122 -0.0678 0.4270 7.4983
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## ID (Intercept) 70529 265.57
## month 5079 71.27 -0.40
## Residual 54224 232.86
## Number of obs: 2371, groups: ID, 364
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 840.524 15.224 55.21
## month -106.416 5.207 -20.44
##
## Correlation of Fixed Effects:
## (Intr)
## month -0.339
AIC(ranslope.mod)
## [1] 33612.96
## random intercept + slope, with a 5-df spline for the population mean trend
ranbs.mod = lmer(cd4 ~ (1 + month | ID) + bs(month, 5), data = cd4)
cd4$ranbs.mod = fitted(ranbs.mod)
## population-level (fixed-effects only) fitted values: X %*% beta-hat
pop.fit = data.frame(month = cd4$month,
                     fitted = getME(ranbs.mod, "X") %*% fixef(ranbs.mod))
ggplot(cd4, aes(x = month, y = cd4, group = ID)) +
  geom_point() +
  geom_line(alpha = .2) +
  geom_line(data = pop.fit, aes(group = NULL, y = fitted),
            color = "red", size = 1.1) +
  geom_line(aes(y = ranbs.mod), color = "blue", alpha = .5)

## random intercept/slope + spline model summary (transcript below)
summary(ranbs.mod)
## Linear mixed model fit by REML ['lmerMod']
## Formula: cd4 ~ (1 + month | ID) + bs(month, 5)
## Data: cd4
##
## REML criterion at convergence: 33408.5
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -4.6873 -0.5043 -0.0494 0.4443 7.5187
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## ID (Intercept) 71333 267.08
## month 4851 69.65 -0.43
## Residual 50484 224.69
## Number of obs: 2371, groups: ID, 364
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 910.11 50.09 18.169
## bs(month, 5)1 157.54 73.43 2.145
## bs(month, 5)2 164.87 47.42 3.477
## bs(month, 5)3 -588.42 64.43 -9.133
## bs(month, 5)4 -264.07 65.57 -4.027
## bs(month, 5)5 -609.54 82.14 -7.421
##
## Correlation of Fixed Effects:
## (Intr) b(,5)1 b(,5)2 b(,5)3 b(,5)4
## bs(mnth,5)1 -0.788
## bs(mnth,5)2 -0.791 0.485
## bs(mnth,5)3 -0.835 0.831 0.471
## bs(mnth,5)4 -0.652 0.404 0.738 0.332
## bs(mnth,5)5 -0.632 0.574 0.401 0.703 0.223
## NOTE: a second, byte-identical summary(ranbs.mod) call and its transcript
## were removed here (accidental duplication of the block above).
AIC(ranbs.mod)
## [1] 33428.49
####################################################################
Multilevel models
####################################################################
## two simulated data examples
####################################################################
I = 100   ## number of groups
J = 5     ## observations per group
## group-indicator design matrix: column i is 1 exactly on group i's J rows.
## FIX: kronecker(diag(I), rep(1, J)) builds the identical matrix directly,
## avoiding the "data length is not a sub-multiple..." recycling warning that
## the original matrix(c(rep(1, J), rep(0, I*J)), ...) construction emitted.
Z.des = kronecker(diag(I), rep(1, J))
## large between-group variance relative to residual noise
bi = rnorm(I, 0, 25)
eps = rnorm(I*J, 0, 1)
yij = Z.des %*% bi + eps
plot(Z.des %*% bi + eps, xlab = "ij", ylab = expression(y[ij]), pch = 18)

## same noise, but between-group variance comparable to residual variance
bi = rnorm(I, 0, 1)
yij = Z.des %*% bi + eps
plot(Z.des %*% bi + eps, xlab = "ij", ylab = expression(y[ij]), pch = 18)

## recover the group labels from the design and fit a random intercept model
subjs = Z.des %*% (1:I)
lmer(yij ~ (1 | subjs))
## Linear mixed model fit by REML ['lmerMod']
## Formula: yij ~ (1 | subjs)
## REML criterion at convergence: 1668.834
## Random effects:
## Groups Name Std.Dev.
## subjs (Intercept) 0.8999
## Residual 1.1077
## Number of obs: 500, groups: subjs, 100
## Fixed Effects:
## (Intercept)
## -0.191
####################################################################
## three level data
####################################################################
set.seed(121)
I = 20    ## level-3 units
J = 10    ## level-2 units per level-3 unit
K = 10    ## observations per level-2 unit
## indicator design matrices built with kronecker(): identical matrices to the
## original matrix() recycling trick, but without the warnings it emitted
ZI.des = kronecker(diag(I), rep(1, J*K))     ## maps observation -> level-3 unit
ZIJ.des = kronecker(diag(I*J), rep(1, K))    ## maps observation -> level-2 unit
bi = rnorm(I, 0, 50)      ## level-3 random effects
bij = rnorm(I*J, 0, 25)   ## level-2 random effects
## FIX: one residual per observation (I*J*K of them) -- the original drew only
## I*J values and relied on silent recycling across the 2000 observations
eps = rnorm(I*J*K, 0, 1)
yij = ZI.des %*% bi + ZIJ.des %*% bij + eps
dev.new(width = 7, height = 3.5)
par(mai = c(.7,.7,.2,.2))
plot(yij, xlab = "ij", ylab = expression(y[ij]), pch = 18)
abline(v = c(J*K * (0:I)), lty = 2)
## grouping labels for the two nesting levels
L1 = ZI.des %*% (1:I)
L2 = ZIJ.des %*% (1:(I*J))
nested.mod = lmer(yij ~ (1 | L1) + (1 | L2))
summary(nested.mod)
## NOTE: the transcript below was produced by the original code (recycled eps);
## numbers will differ slightly after the fix above.
## Linear mixed model fit by REML ['lmerMod']
## Formula: yij ~ (1 | L1) + (1 | L2)
##
## REML criterion at convergence: 7464.3
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.84231 -0.67622 -0.02368 0.67047 2.54767
##
## Random effects:
## Groups Name Variance Std.Dev.
## L2 (Intercept) 527.003 22.957
## L1 (Intercept) 2137.465 46.233
## Residual 1.004 1.002
## Number of obs: 2000, groups: L2, 200; L1, 20
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 0.7712 10.4646 0.074
####################################################################
## effect of bayes
####################################################################
## conjugate normal-normal example: prior (red), likelihood (blue), posterior (black)
set.seed(2008)
mu.grid = seq(-5, 25, length = 1000)
## data: n draws from N(l.mean, l.var)
n = 10
l.var = 25
l.mean = 20
y = rnorm(n, mean = l.mean, sd = sqrt(l.var))
like = dnorm(mu.grid, mean = mean(y), sd = sqrt(l.var/n))
## prior on mu
p.var = 50
p.mean = 2
prior = dnorm(mu.grid, mean = p.mean, sd = sqrt(p.var))
## posterior mean is a precision-weighted compromise between the sample
## mean and the prior mean
w = p.var / (p.var + l.var/n)
post.mean = w * mean(y) + (1 - w) * p.mean
post.var = (p.var * l.var / n) / (p.var + l.var/n)
post = dnorm(mu.grid, mean = post.mean, sd = sqrt(post.var))
plot(mu.grid, post, type = 'l', lwd = 3, xlab = expression(mu), ylab = expression(p(mu)))
points(mu.grid, prior, type = 'l', col = 2, lwd = 3)
points(mu.grid, like, type = 'l', col = 4, lwd = 3)
####################################################################
Measurement error; mediation; confounding
####################################################################
## simulate a simple linear regression with measurement error
####################################################################
## generate data
set.seed(14)
x = rnorm(30, 3, 3)                ## true covariate
w = x + rnorm(30, 0, 3)            ## error-prone measurement of x
y = 2 + .6 * x + rnorm(30, 0, 1)   ## outcome depends on the TRUE x
## fit regressions: true covariate vs mismeasured covariate
linmod.x = lm(y ~ x)
linmod.w = lm(y ~ w)
## plot: black points = (x, y), red points = (w, y);
## solid line = fit on x, dashed line = (attenuated) fit on w
par(mai = c(.1, .1, .1, .1))
plot(x, y, axes = FALSE, col = "black", pch = 19,
     xlab = "", ylab = "", xlim = range(c(x, w)))
points(w, y, col = "red", pch = 19)
abline(h = 0, lwd = 2)
abline(v = 0, lwd = 2)
abline(linmod.x, col = "blue", lwd = 2)
abline(linmod.w, col = "blue", lwd = 2, lty = 2)

####################################################################
## simulate a regression calibration model
####################################################################
## generate data (same draw order as before: x, then w, then y, then z)
set.seed(14)
x = rnorm(30, 3, 3)                ## true covariate
w = x + rnorm(30, 0, 3)            ## error-prone measurement
y = 2 + .6 * x + rnorm(30, 0, 1)
z = x + rnorm(30, 0, 2)            ## second error-prone measurement of x
## regression calibration: replace w with its prediction from z
fitted = fitted(lm(w ~ z))
## fit regressions on the true, mismeasured, and calibrated covariates
linmod.x = lm(y ~ x)
linmod.w = lm(y ~ w)
linmod.fitted = lm(y ~ fitted)
## plot: black = (x, y), red = (w, y); solid = fit on x,
## dashed = naive fit on w and calibrated fit
par(mai = c(.1, .1, .1, .1))
plot(x, y, axes = FALSE, col = "black", pch = 19,
     xlab = "", ylab = "", xlim = range(c(x, w)))
points(w, y, col = "red", pch = 19)
abline(h = 0, lwd = 2)
abline(v = 0, lwd = 2)
abline(linmod.x, col = "blue", lwd = 2)
abline(linmod.w, col = "blue", lwd = 2, lty = 2)
abline(linmod.fitted, col = "blue", lwd = 2, lty = 2)

####################################################################
## perform SIMEX
####################################################################
## generate data
set.seed(14)
sig2u = 9                           ## measurement-error variance
x = rnorm(30, 3, 3)
w = x + rnorm(30, 0, sqrt(sig2u))
y = 2+.6*x +rnorm(30, 0, 1)
## SIMEX simulation step: add extra noise at multiples lambda of sig2u and
## record the estimated slope for each of 1000 replicates per lambda
lam = seq(.5, 5, by = .5)
beta.lam = matrix(NA, nrow = 1000, ncol = length(lam))
for(l in seq_along(lam)){
  for(i in 1:1000){
    wb = w + rnorm(30, 0, sqrt(lam[l] * sig2u))
    beta.lam[i,l] = coef(lm(y ~ wb))[2]
  }
}
## lambda = 0 corresponds to the naive (no added noise) estimate
lam = c(0, lam)
mean.beta = c(coef(lm(y ~ w))[2], apply(beta.lam,2,mean))
## extrapolation step: cubic fit in lambda, extrapolated back to lambda = -1
## (the "no measurement error" point)
lam2 = lam^2
lam3 = lam^3
simex.coef = coef(lm(mean.beta ~ lam + lam2 + lam3))
new.lam = seq(-1, 5, by = .1)
simex.fit = cbind(1, new.lam, new.lam^2, new.lam^3) %*% simex.coef
## FIX: quartz() only exists on macOS; dev.new() is the portable equivalent
## (and is what this file uses elsewhere)
dev.new(width = 6, height = 4)
par(mai = c(.8, .8, .1, .1))
plot(lam, mean.beta,
     xlim = range(new.lam),
     ylim = range(0, simex.fit), pch = 19,
     col = c(2, rep(1, length(lam)-1 )),
     xlab = expression(lambda),
     ylab = expression(beta[1]))
points(new.lam, simex.fit, type = 'l', lty = 2)

####################################################################
Logistic regression
####################################################################
## simulate some binary data
####################################################################
set.seed(11201)
## true coefficients on the logit scale
beta0 = 1
beta1 = .75
x = rnorm(100, 0, 3)
## success probability via the inverse logit (boot::inv.logit)
pi = inv.logit(beta0 + beta1 * x)
y = rbinom(100, 1, pi)
data = data.frame(x = x, y = y)
## naive linear-probability fit: the fitted line escapes [0, 1]
ggplot(data, aes(x = x, y = y)) +
  geom_point() +
  theme_bw() +
  ylim(-.25, 1.25) +
  stat_smooth(method = "lm", se = FALSE)
## Warning: Removed 10 rows containing missing values (geom_smooth).

####################################################################
## logistic regression
####################################################################
## fit the logistic regression by maximum likelihood (logit link)
model = glm(y~x, family = binomial(link = "logit"), data = data)
summary(model)
##
## Call:
## glm(formula = y ~ x, family = binomial(link = "logit"), data = data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9360 -0.4631 0.1561 0.5564 1.8131
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.1072 0.3357 3.298 0.000974 ***
## x 0.8097 0.1664 4.865 0.00000115 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 129.49 on 99 degrees of freedom
## Residual deviance: 73.24 on 98 degrees of freedom
## AIC: 77.24
##
## Number of Fisher Scoring iterations: 6
## add fitted probabilities and their logit transforms to the data
## NOTE(review): logit() here resolves to boot::logit (boot is loaded above)
data = mutate(data, fitted = fitted(model),
fitted_logit = logit(fitted))
## left panel: fitted probability curve over the data;
## right panel: the fit is linear on the logit scale
p1 = ggplot(data, aes(x = x, y = y)) + geom_point() +
theme_bw() + ylim(-.25, 1.25) +
geom_line(aes(x = x, y = fitted), color = "blue")
p2 = ggplot(data, aes(x = x, y = fitted_logit)) + geom_line(color = "blue") +
theme_bw()
grid.arrange(p1, p2, nrow = 1, ncol = 2)

####################################################################